From 959fca404e5461520bdb13cd0ce4df91e1891506 Mon Sep 17 00:00:00 2001
From: mappu
Date: Sat, 20 May 2023 18:48:35 +1200
Subject: [PATCH] initial commit

---
 README.md                        |  12 ++
 archive-scrape/scrape.sh         |   5 +
 collect/collect.php              | 158 ++++++++++++++++++
 collect/stats.sh                 |  32 ++++
 irp2bolt/go.mod                  |   8 +
 irp2bolt/go.sum                  |   4 +
 irp2bolt/main.go                 | 272 +++++++++++++++++++++++++++++++
 yatwiki-scrape/scrape-wikidb.php |  32 ++++
 8 files changed, 523 insertions(+)
 create mode 100644 README.md
 create mode 100755 archive-scrape/scrape.sh
 create mode 100755 collect/collect.php
 create mode 100755 collect/stats.sh
 create mode 100644 irp2bolt/go.mod
 create mode 100644 irp2bolt/go.sum
 create mode 100644 irp2bolt/main.go
 create mode 100644 yatwiki-scrape/scrape-wikidb.php

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0ad72b2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,12 @@
+# Imgur Rescue Project
+
+In April 2023 the website imgur.com [announced](https://help.imgur.com/hc/en-us/articles/14415587638029/) it may delete old data/images that are rarely accessed and were uploaded without an account.
+
+This repository contains public versions of internal scripts used to extract imgur.com links from `yatwiki` and `archive`; download the images and metadata; and convert them to a `contented` database for ongoing read-only hosting.
+
+The following tools are available:
+
+- `archive-scrape`: extract imgur links from an [`archive`](https://code.ivysaur.me/archive) installation.
+- `yatwiki-scrape`: extract imgur links from a [`yatwiki`](https://code.ivysaur.me/yatwiki) installation.
+- `collect`: download imgur links and metadata, recursively following albums, with soft caching.
+- `irp2bolt`: convert the resulting imgur links and metadata into a Bolt database for use with [`contented`](https://code.ivysaur.me/contented).
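+
+For example, a typical end-to-end run might look like the following sketch. All paths here are illustrative: `scrape.sh` greps its working directory recursively, `collect.php` reads `all-urls.txt` from its working directory, and `irp2bolt` expects `../metadata.*` and `../images` relative to where it runs.
+
+```bash
+# 1. Gather links
+(cd /path/to/archive-data && /path/to/this-repo/archive-scrape/scrape.sh)
+./yatwiki-scrape/scrape-wikidb.php > yatwiki-urls.txt
+cat archive-urls.txt yatwiki-urls.txt | sort | uniq > collect/all-urls.txt
+
+# 2. Download images and metadata, keeping a log for stats.sh
+(cd collect && ./collect.php | tee collect-logs.txt && ./stats.sh)
+
+# 3. Convert to a Bolt database, run from a directory whose parent
+#    holds the metadata.*/images output of the previous step
+(cd collect && mkdir -p out && cd out && go run ../../irp2bolt)
+```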
diff --git a/archive-scrape/scrape.sh b/archive-scrape/scrape.sh
new file mode 100755
index 0000000..2a73e76
--- /dev/null
+++ b/archive-scrape/scrape.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+set -eu
+
+LC_ALL=C grep -RP 'http[^ ]+imgur\.com[^ ]*' . -o -h | tr -d $'\r' | sed 's/http:/https:/' | sort | uniq > archive-urls.txt
diff --git a/collect/collect.php b/collect/collect.php
new file mode 100755
index 0000000..fbefb3d
--- /dev/null
+++ b/collect/collect.php
@@ -0,0 +1,158 @@
+#!/usr/bin/php
+<?php
+
+// Magic-byte checks on the raw downloaded bytes
+function is_png($data) {
+	return substr($data, 0, 8) === "\x89PNG\r\n\x1a\n";
+}
+
+function is_jpg($data) {
+	return substr($data, 0, 3) === "\xFF\xD8\xFF";
+}
+
+function is_gif($data) {
+	return substr($data, 0, 4) === "GIF8";
+}
+
+function is_mp4($data) {
+	return substr($data, 4, 4) === "ftyp";
+}
+
+// Look up album/media metadata from the imgur API. NB: the endpoint and
+// client_id here are assumptions; the post/v1 path nouns happen to match
+// our on-disk 'media'/'albums' layout. Substitute a registered client_id.
+function imgur_api_fetch($code, $type) {
+	$url = "https://api.imgur.com/post/v1/{$type}/{$code}?client_id=YOUR_CLIENT_ID&include=media";
+	$ret = @file_get_contents($url);
+	if ($ret === false) {
+		throw new Exception("no response for {$code}");
+	}
+	return json_decode($ret, true);
+}
+
+function imgur_download_single($code, $type) {
+
+	// Soft caching: don't retry codes that previously failed
+	if (file_exists("errors.{$type}/".$code)) {
+		return false;
+	}
+
+	if (! file_exists("metadata.{$type}/".$code)) {
+
+		echo "Downloading {$code} (type={$type})\n";
+
+		try {
+			$metadata = imgur_api_fetch($code, $type);
+
+		} catch (Exception $ex) {
+			echo "Failed API lookup (WARNING)\n";
+			file_put_contents("errors.{$type}/".$code, json_encode([
+				"error" => $ex->getMessage(),
+				"failure_time" => date(DATE_RSS),
+			]));
+
+			return false;
+		}
+
+		file_put_contents("metadata.{$type}/".$code, json_encode($metadata));
+
+	} else {
+		$metadata = json_decode(file_get_contents("metadata.{$type}/".$code), true);
+	}
+
+	echo "code {$code} (type={$type}) contains ".$metadata['image_count']." media entries\n";
+	if ($metadata['image_count'] == 0) {
+		echo "WARNING: weird album with no images!\n";
+		return true;
+	}
+
+	foreach($metadata['media'] as $single) {
+
+		echo "- entry ".$single['id']."\n";
+
+		if (file_exists("images/".$single['id'])) {
+			echo "already exists (OK)\n";
+			continue;
+		}
+
+		// Download whole URL
+		$ret = file_get_contents($single['url']);
+		if ($ret === false) {
+			echo "download failed (WARNING)\n";
+			continue;
+		}
+
+		if (! is_png($ret) && ! is_jpg($ret) && ! is_gif($ret) && ! is_mp4($ret)) {
+			echo "unexpected result, not jpg/gif/png/mp4 (WARNING)\n";
+			file_put_contents("images/".$single['id'].".INVALID-NOT-AN-IMAGE", $ret);
+			continue;
+		}
+
+		// imgur serves a known 503-byte placeholder image for removed content
+		if (strlen($ret) === 503 && md5($ret) === "d835884373f4d6c8f24742ceabe74946") {
+			echo "fake image result for image not found (WARNING)\n";
+			continue;
+		}
+
+		file_put_contents("images/".$single['id'], $ret);
+	}
+
+	// all done
+	return true;
+}
+
+function main() {
+
+	foreach(['images', 'metadata.albums', 'metadata.media', 'errors.albums', 'errors.media'] as $dirname) {
+		if (! is_dir($dirname)) {
+			mkdir($dirname);
+		}
+	}
+
+	$urls = explode("\n", file_get_contents("all-urls.txt"));
+	$matches = [];
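+
+	// Examples of the URL shapes handled below (illustrative):
+	//   https://i.imgur.com/AbCdEf1.jpg     -> single media AbCdEf1
+	//   https://imgur.com/AbCdEf1,GhIjKl2   -> two single media entries
+	//   https://imgur.com/a/AbCdEf1         -> album AbCdEf1
+	//   https://m.imgur.com/gallery/AbCdEf1 -> album AbCdEf1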
+
+	foreach($urls as $url) {
+
+		if (preg_match("~^https://(i\\.|m\\.|img\\.|www\\.)?imgur\\.com/([0-9a-zA-Z,]+)\\.?(jpg|jpeg|gif|webm|png|gifv|mp4)?(#.+)?$~", $url, $matches)) {
+
+			foreach(explode(',', $matches[2]) as $single_id) {
+				imgur_download_single($single_id, 'media');
+			}
+
+		} else if (preg_match("~^https://(i\\.|m\\.|img\\.|www\\.)?imgur\\.com/(a/|gallery/|(?:r|t|topic)/[a-zA-Z0-9_]+/)?([0-9a-zA-Z,]+)\\.?(jpg|jpeg|gif|webm|png|gifv|mp4)?(#.+)?$~", $url, $matches)) {
+
+			imgur_download_single($matches[3], 'albums');
+
+		} else {
+			echo "WARNING: Unsupported URL: {$url}\n";
+
+		}
+
+	}
+
+}
+
+main();
diff --git a/collect/stats.sh b/collect/stats.sh
new file mode 100755
index 0000000..23d1dba
--- /dev/null
+++ b/collect/stats.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+echo "Current run:"
+
+echo -n "- Downloading: "
+fgrep 'Downloading' ./collect-logs.txt | wc -l
+
+echo -n "- Unsupported: "
+fgrep 'Unsupported' ./collect-logs.txt | wc -l
+
+echo -n "- 404/missing: "
+fgrep 'Failed API lookup' ./collect-logs.txt | wc -l
+#fgrep '404 Not Found' ./collect-logs.txt | wc -l
+#fgrep 'fake image result for image not found' ./collect-logs.txt | wc -l
+
+echo "- Distribution:"
+fgrep 'media entries' collect-logs.txt | cut -d' ' -f4- | sort | uniq -c
+
+echo ""
+echo "Full archive:"
+
+echo -n "- Known URLs: "
+wc -l < all-urls.txt
+
+echo -n "- 404/missing: "
+( ls errors.albums ; ls errors.media ) | wc -l
+
+echo -n "- Saved images: "
+ls images | wc -l
+
+echo -n "- Saved metadata: "
+( ls metadata.albums ; ls metadata.media ) | wc -l
diff --git a/irp2bolt/go.mod b/irp2bolt/go.mod
new file mode 100644
index 0000000..e66ed5c
--- /dev/null
+++ b/irp2bolt/go.mod
@@ -0,0 +1,8 @@
+module irp2bolt
+
+go 1.19
+
+require (
+	go.etcd.io/bbolt v1.3.7 // indirect
+	golang.org/x/sys v0.4.0 // indirect
+)
diff --git a/irp2bolt/go.sum b/irp2bolt/go.sum
new file mode 100644
index 0000000..0fc529b
--- /dev/null
+++ b/irp2bolt/go.sum
@@ -0,0 +1,4 @@
+go.etcd.io/bbolt v1.3.7 h1:j+zJOnnEjF/kyHlDDgGnVL/AIqIJPq8UoB2GSNfkUfQ=
+go.etcd.io/bbolt v1.3.7/go.mod h1:N9Mkw9X8x5fupy0IKsmuqVtoGDyxsaDlbk4Rd05IAQw=
+golang.org/x/sys v0.4.0 h1:Zr2JFtRQNX3BCZ8YtxRE9hNJYC8J6I1MVbMg6owUp18=
+golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
diff --git a/irp2bolt/main.go b/irp2bolt/main.go
new file mode 100644
index 0000000..a4073b0
--- /dev/null
+++ b/irp2bolt/main.go
@@ -0,0 +1,272 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"log"
+	"os"
+	"time"
+
+	"go.etcd.io/bbolt"
+)
+
+type ImgurSingleMedia struct {
+	ID          string    `json:"id"`
+	Title       string    `json:"title"`
+	Description string    `json:"description"`
+	MimeType    string    `json:"mime_type"`
+	CreatedAt   time.Time `json:"created_at"` // e.g. 2013-10-28T02:37:02Z
+
+	Width     int64  `json:"width"`
+	Height    int64  `json:"height"`
+	Extension string `json:"ext"`
+}
+
+func (ism ImgurSingleMedia) InventName() string {
+	ret := ism.Title
+	if len(ism.Description) > 0 {
+		if len(ret) > 0 {
+			ret += " - "
+		}
+		ret += ism.Description
+	}
+
+	if len(ret) == 0 {
+		// No title or description
+		// Guess we just name it after the ID
+		ret = ism.ID
+	}
+
+	ret += "." + ism.Extension
+
+	return ret
+}
+
+type ImgurInfo struct {
+	ID          string             `json:"id"`
+	Title       string             `json:"title"`
+	Description string             `json:"description"`
+	CreatedAt   time.Time          `json:"created_at"`
+	Media       []ImgurSingleMedia `json:"media"`
+}
+
+// AlbumJson returns the album's contents as a JSON array of media IDs.
+func (i ImgurInfo) AlbumJson() []byte {
+	arr := make([]string, 0, len(i.Media))
+	for _, m := range i.Media {
+		arr = append(arr, m.ID)
+	}
+
+	bb, err := json.Marshal(arr)
+	if err != nil {
+		panic(err)
+	}
+
+	return bb
+}
+
+func (i ImgurInfo) InventName() string {
+	ret := i.Title
+
+	if len(i.Description) > 0 {
+		if len(ret) > 0 {
+			ret += " - "
+		}
+		ret += i.Description
+	}
+
+	if len(ret) > 0 {
+		return ret // Title + description is pretty good for an album
+	}
+
+	if len(i.Media) > 0 {
+		// Describe this album based on the first media instead
+		return i.Media[0].InventName()
+	}
+
+	// No name/description in either the gallery or its first image
+	// Guess we just name it after the ID
+	return i.ID
+}
+
+type ContentedMetadata struct {
+	FileHash   string
+	FileSize   int64
+	UploadTime time.Time
+	UploadIP   string
+	Filename   string
+	MimeType   string
+}
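+
+// The output layout below mirrors what contented reads: a single METADATA
+// bucket keyed by file hash (here, the imgur media ID stands in for the
+// hash), with each value a JSON-encoded ContentedMetadata. Albums are keyed
+// with an a/ prefix, which can't occur naturally in contented's filehash
+// algorithm, use MimeType "contented/album", and have their member ID list
+// written to an a/ file on disk.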
+func main() {
+	db, err := bbolt.Open(fmt.Sprintf("output-%d.db", time.Now().Unix()), 0644, bbolt.DefaultOptions)
+	if err != nil {
+		panic(err)
+	}
+	defer db.Close()
+
+	err = db.Update(func(tx *bbolt.Tx) error {
+
+		bb, err := tx.CreateBucketIfNotExists([]byte(`METADATA`))
+		if err != nil {
+			panic(err)
+		}
+
+		//
+		// Media
+		//
+
+		media, err := os.ReadDir("../metadata.media")
+		if err != nil {
+			panic(err)
+		}
+
+		var addMediaCount int64 = 0
+
+		for _, mediaInfo := range media {
+			infoJson, err := os.ReadFile("../metadata.media/" + mediaInfo.Name())
+			if err != nil {
+				panic(err)
+			}
+
+			var info ImgurInfo
+			err = json.Unmarshal(infoJson, &info)
+			if err != nil {
+				panic(err)
+			}
+
+			if len(info.Media) != 1 {
+				panic(fmt.Sprintf("media %s: expected exactly 1 media entry, got %d", mediaInfo.Name(), len(info.Media)))
+			}
+
+			// Ensure image file exists
+			finfo, err := os.Stat("../images/" + mediaInfo.Name())
+			if err != nil {
+				log.Printf("Missing image for media %s, skipping", mediaInfo.Name())
+				continue
+				// panic(err)
+			}
+
+			cinfoBytes, err := json.Marshal(ContentedMetadata{
+				FileHash:   mediaInfo.Name(),
+				FileSize:   finfo.Size(),
+				UploadTime: info.CreatedAt,
+				UploadIP:   "n/a",
+				Filename:   info.InventName(),
+				MimeType:   info.Media[0].MimeType,
+			})
+			if err != nil {
+				panic(err)
+			}
+
+			err = bb.Put([]byte(mediaInfo.Name()), cinfoBytes)
+			if err != nil {
+				panic(err)
+			}
+
+			addMediaCount += 1
+		}
+
+		log.Printf("Added %d media entries OK", addMediaCount)
+
+		//
+		// Albums
+		//
+
+		albums, err := os.ReadDir("../metadata.albums")
+		if err != nil {
+			panic(err)
+		}
+
+		// The album JSON files are written into an a/ subdirectory, which
+		// must exist before we start
+		err = os.MkdirAll("a", 0755)
+		if err != nil {
+			panic(err)
+		}
+
+		var addAlbumCount int64 = 0
+		var addAlbumMediaCount int64 = 0
+		var albumsWithNoImagesCount int64 = 0
+
+		for _, albuminfo := range albums {
+			infoJson, err := os.ReadFile("../metadata.albums/" + albuminfo.Name())
+			if err != nil {
+				panic(err)
+			}
+
+			var info ImgurInfo
+			err = json.Unmarshal(infoJson, &info)
+			if err != nil {
+				panic(err)
+			}
+
+			if len(info.Media) == 0 {
+				log.Printf("Album '%s' contains no images, allowing anyway", albuminfo.Name())
+				albumsWithNoImagesCount += 1
+			}
+
+			// Add gallery entries for each of the media elements
+
+			for _, mediaInfo := range info.Media {
+
+				// Ensure image file exists
+				finfo, err := os.Stat("../images/" + mediaInfo.ID)
+				if err != nil {
+					log.Printf("Missing image %s for album %s, skipping", mediaInfo.ID, albuminfo.Name())
+					continue
+					// panic(err)
+				}
+
+				cinfoBytes, err := json.Marshal(ContentedMetadata{
+					FileHash:   mediaInfo.ID,
+					FileSize:   finfo.Size(),
+					UploadTime: info.CreatedAt,
+					UploadIP:   "n/a",
+					Filename:   mediaInfo.InventName(),
+					MimeType:   mediaInfo.MimeType,
+				})
+				if err != nil {
+					panic(err)
+				}
+
+				err = bb.Put([]byte(mediaInfo.ID), cinfoBytes)
+				if err != nil {
+					panic(err)
+				}
+
+				addAlbumMediaCount += 1
+
+			}
+
+			// Add album entry for the overall album
+			albumHash := `a/` + albuminfo.Name() // Use a/ prefix. This can't naturally happen in contented's filehash algorithm
+			albumJson := info.AlbumJson()
+			err = os.WriteFile(albumHash, albumJson, 0644)
+			if err != nil {
+				panic(err)
+			}
+
+			cinfoBytes, err := json.Marshal(ContentedMetadata{
+				FileHash:   albumHash,
+				FileSize:   int64(len(albumJson)),
+				UploadTime: info.CreatedAt,
+				UploadIP:   "n/a",
+				Filename:   info.InventName(),
+				MimeType:   "contented/album",
+			})
+			if err != nil {
+				panic(err)
+			}
+
+			err = bb.Put([]byte(albumHash), cinfoBytes)
+			if err != nil {
+				panic(err)
+			}
+
+			addAlbumCount += 1
+		}
+
+		log.Printf("Added %d album entries OK with %d additional image entries", addAlbumCount, addAlbumMediaCount)
+
+		log.Printf("There are %d albums with no images", albumsWithNoImagesCount)
+
+		// Fully imported
+		return nil
+	})
+	if err != nil {
+		panic(err)
+	}
+}
diff --git a/yatwiki-scrape/scrape-wikidb.php b/yatwiki-scrape/scrape-wikidb.php
new file mode 100644
index 0000000..90e3952
--- /dev/null
+++ b/yatwiki-scrape/scrape-wikidb.php
@@ -0,0 +1,32 @@
+#!/usr/bin/php
+<?php
+
+// NB: the database path is an assumption; point this at your yatwiki
+// SQLite database
+$db = new PDO('sqlite:yatwiki.db');
+$links = [];
+
+foreach($db->query('SELECT id, body FROM articles') as $article) {
+	$body = gzinflate($article['body']);
+
+	// [imgur] wiki tags
+	preg_match_all('~\[imgur\](.+?)\[~', $body, $matches);
+	foreach($matches[1] as $short) {
+		$links[] = 'https://i.imgur.com/'.$short;
+	}
+
+	// Inline links
+	preg_match_all('~https?://[^ \t\n"><\]\[]+imgur\.com[^ \t\n"><\]\[]*~', $body, $matches);
+	foreach($matches[0] as $link) {
+		$links[] = $link;
+	}
+}
+
+// Output
+foreach($links as $link) {
+	// An [imgur] tag may itself contain a full URL; collapse the doubled
+	// prefix that produces
+	echo str_replace("https://i.imgur.com/http://i.imgur.com", "https://i.imgur.com", $link)."\n";
+}
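+
+// Typical use, feeding collect/ (output file name is illustrative):
+//   ./scrape-wikidb.php | sort | uniq > yatwiki-urls.txt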