initial commit

This commit is contained in:
mappu 2023-05-20 18:48:35 +12:00
commit 959fca404e
8 changed files with 523 additions and 0 deletions

12
README.md Normal file
View File

@ -0,0 +1,12 @@
# Imgur Rescue Project
In April 2023, the website imgur.com [announced](https://help.imgur.com/hc/en-us/articles/14415587638029/) that it may delete old data/images that are rarely accessed and were uploaded without an account.
This repository contains public versions of internal scripts used to extract imgur.com links from `yatwiki` and `archive`; download the images and metadata; and convert them to a `contented` database for ongoing read-only hosting.
The following tools are available:
- `archive-scrape`: extract imgur links from an [`archive`](https://code.ivysaur.me/archive) installation.
- `yatwiki-scrape`: extract imgur links from a [`yatwiki`](https://code.ivysaur.me/yatwiki) installation.
- `collect`: download imgur links and metadata, recursively following albums, with soft caching.
- `irp2bolt`: convert the resulting imgur links and metadata into a Bolt database for use with [`contented`](https://code.ivysaur.me/contented).

5
archive-scrape/scrape.sh Executable file
View File

@ -0,0 +1,5 @@
#!/bin/bash
# Collect every imgur.com URL found under the current directory into
# archive-urls.txt, one URL per line, sorted and de-duplicated.
set -eu

# grep: -R recurse, -P Perl-compatible regex, -o print only the matched text,
#       -h omit file names. Inside the bracket expression `\b` is a backspace
#       character (not a word boundary), so the tail of the match runs up to
#       the next space or backspace.
# tr:   drop carriage returns (e.g. from CRLF line endings).
# sed:  rewrite the first `http:` on each line to `https:`.
LC_ALL=C grep -RP 'http[^ ]+imgur\.com[^\b ]*' . -o -h \
    | tr -d $'\r' \
    | sed 's/http:/https:/' \
    | sort \
    | uniq > archive-urls.txt

158
collect/collect.php Executable file
View File

@ -0,0 +1,158 @@
#!/usr/bin/php
<?php
// Report every class of PHP error, warning and notice while the script runs.
error_reporting(E_ALL);
// Client ID sent as the `client_id` query parameter on imgur API requests.
define('IMGUR_CLIENT_ID', '546c25a59c58ad7'); // extract from any web interface pageview
/**
 * Report whether $data begins with the four bytes "\x89PNG".
 *
 * @param string $data Raw file contents
 * @return bool
 */
function is_png($data) {
    return strncmp($data, "\x89PNG", 4) === 0;
}
/**
 * Report whether $data looks like a JPEG: either it begins with the bytes
 * FF D8 FF, or the string "JFIF" occurs within its first 64 bytes.
 *
 * @param string $data Raw file contents
 * @return bool
 */
function is_jpg($data) {
    $head = substr($data, 0, 64);
    return substr($head, 0, 3) === "\xFF\xD8\xFF" || str_contains($head, "JFIF");
}
/**
 * Report whether $data begins with the four bytes "GIF8".
 *
 * @param string $data Raw file contents
 * @return bool
 */
function is_gif($data) {
    return strncmp($data, "GIF8", 4) === 0;
}
/**
 * Report whether the string "isom" occurs within the first 64 bytes of $data.
 *
 * @param string $data Raw file contents
 * @return bool
 */
function is_mp4($data) {
    $head = substr($data, 0, 64);
    return strpos($head, "isom") !== false;
}
/**
 * Fetch the metadata record for one imgur code from the imgur post API.
 *
 * @param string $code imgur ID to look up
 * @param string $type 'media' or 'albums'
 * @return array Decoded JSON response as an associative array
 * @throws Exception if the request fails, the body is not a JSON object,
 *                   the API reports errors, or the returned `is_album` flag
 *                   does not agree with $type
 */
function imgur_mediainfo(string $code, string $type) {
    // n.b. 'posts' is often a synonym for 'media' with the same result
    // There is also a /meta endpoint possible after {$code}
    $ret = file_get_contents("https://api.imgur.com/post/v1/{$type}/{$code}?client_id=" . IMGUR_CLIENT_ID . "&include=media");
    if ($ret === false) {
        throw new Exception("Failed API lookup for {$code} as {$type}");
    }

    // An empty body has no first byte to inspect; treat it like any other
    // non-JSON response instead of triggering a string-offset warning.
    if ($ret === '' || $ret[0] !== '{') {
        throw new Exception("API result for {$code} as {$type} got unexpected body: {$ret}");
    }

    $obj = json_decode($ret, true);
    if (! is_array($obj)) {
        // Starts with '{' but does not decode: report that, rather than
        // falling through to a misleading type-mismatch error below.
        throw new Exception("API result for {$code} as {$type} is not valid JSON: {$ret}");
    }

    if (isset($obj['errors'])) {
        throw new Exception("API result for {$code} as {$type} got unexpected body: {$ret}");
    }

    // The API's own idea of album-vs-media must agree with what was asked for
    if (! array_key_exists('is_album', $obj) || ($type === 'albums') !== $obj['is_album']) {
        throw new Exception("Unexpected type mismatch from API");
    }

    return $obj;
}
/**
 * Download the metadata and every media file for one imgur code.
 *
 * Metadata is read from metadata.{type}/{code} when that file exists and
 * fetched via imgur_mediainfo() otherwise. A failed metadata fetch is
 * recorded under errors.{type}/{code}, and codes with such a record are
 * skipped. Media files are stored under images/{id}; ones that are already
 * present are not downloaded again.
 *
 * @param string $code imgur ID
 * @param string $type 'media' or 'albums'
 * @return bool false when the code has a recorded error or its metadata
 *              could not be fetched, true otherwise
 */
function imgur_download_single(string $code, string $type) {
    $errorPath = "errors.{$type}/" . $code;
    $metadataPath = "metadata.{$type}/" . $code;

    echo "Downloading {$code}...\n";

    if (file_exists($errorPath)) {
        echo "skipping (known error)\n";
        return false;
    }

    if (file_exists($metadataPath)) {
        // Cached from an earlier run
        $metadata = json_decode(file_get_contents($metadataPath), true);
    } else {
        try {
            $metadata = imgur_mediainfo($code, $type);
        } catch (Exception $ex) {
            echo "WARNING: metadata download failed\n";
            echo (string)$ex . "\n";
            // Remember the failure so later runs skip this code
            $failure = [
                "message" => $ex->getMessage(),
                "failure_time" => date(DATE_RSS),
            ];
            file_put_contents($errorPath, json_encode($failure));
            return false;
        }
        file_put_contents($metadataPath, json_encode($metadata));
    }

    echo "code {$code} (type={$type}) contains " . $metadata['image_count'] . " media entries\n";
    if ($metadata['image_count'] == 0) {
        echo "WARNING: weird album with no images!\n";
        return true;
    }

    foreach ($metadata['media'] as $entry) {
        echo "- entry " . $entry['id'] . "\n";

        if (file_exists("images/" . $entry['id'])) {
            echo "already exists (OK)\n";
            continue;
        }

        // Download whole URL
        $body = file_get_contents($entry['url']);
        if ($body === false) {
            echo "download failed (WARNING)\n";
            continue;
        }

        $recognised = is_png($body) || is_jpg($body) || is_gif($body) || is_mp4($body);
        if (! $recognised) {
            // Keep the unexpected payload next to the images for inspection
            echo "unexpected result, not jpg/gif/png/mp4 (WARNING)\n";
            file_put_contents("images/" . $entry['id'] . ".INVALID-NOT-AN-IMAGE", $body);
            continue;
        }

        // A specific 503-byte payload is treated as a "not found" placeholder
        if (strlen($body) === 503 && md5($body) == "d835884373f4d6c8f24742ceabe74946") {
            echo "fake image result for image not found (WARNING)\n";
            continue;
        }

        file_put_contents("images/" . $entry['id'], $body);
    }

    // all done
    return true;
}
/**
 * Entry point: create the working directories, then read all-urls.txt (one
 * URL per line) and download every recognised imgur media or album URL.
 *
 * Blank lines, including the empty element produced by a trailing newline,
 * are skipped so they are not logged as unsupported URLs. Surrounding
 * whitespace such as a stray carriage return is trimmed before matching.
 */
function main() {
    foreach (['images', 'metadata.albums', 'metadata.media', 'errors.albums', 'errors.media'] as $dirname) {
        if (! is_dir($dirname)) {
            mkdir($dirname);
        }
    }

    $list = file_get_contents("all-urls.txt");
    if ($list === false) {
        echo "ERROR: could not read all-urls.txt\n";
        return;
    }

    $matches = [];
    foreach (explode("\n", $list) as $url) {
        $url = trim($url);
        if ($url === '') {
            continue;
        }

        if (preg_match("~^https://(i\\.|m\\.|img\\.|www\.)?imgur\\.com/([0-9a-zA-Z,]+)\\.?(jpg|jpeg|gif|webm|png|gifv|mp4)?(#.+)?$~", $url, $matches)) {
            // Direct media link; a comma-separated list of IDs is handled one ID at a time
            foreach (explode(',', $matches[2]) as $single_id) {
                imgur_download_single($single_id, 'media');
            }
        } else if (preg_match("~^https://(i\\.|m\\.|img\\.|www\.)?imgur\\.com/(a/|gallery/|(?:r|t|topic)/[a-zA-Z0-9_]+/)?([0-9a-zA-Z,]+)\\.?(jpg|jpeg|gif|webm|png|gifv|mp4)?(#.+)?$~", $url, $matches)) {
            // Anything with an a/, gallery/, r/…, t/… or topic/… prefix is looked up as an album
            imgur_download_single($matches[3], 'albums');
        } else {
            echo "WARNING: Unsupported URL: {$url}\n";
        }
    }
}

main();

32
collect/stats.sh Executable file
View File

@ -0,0 +1,32 @@
#!/bin/bash
# Print summary statistics for the current collect run (from collect-logs.txt)
# and for the archive accumulated on disk so far.
#
# `grep -F` is used for fixed-string matching; the `fgrep` alias is
# obsolescent and recent GNU grep prints a warning each time it is invoked.

echo "Current run:"

echo -n "- Downloading: "
grep -F 'Downloading' ./collect-logs.txt | wc -l

echo -n "- Unsupported: "
grep -F 'Unsupported' ./collect-logs.txt | wc -l

echo -n "- 404/missing: "
grep -F 'Failed API lookup' ./collect-logs.txt | wc -l
#grep -F '404 Not Found' ./collect-logs.txt | wc -l
#grep -F 'fake image result for image not found' ./collect-logs.txt | wc -l

# Histogram of "contains N media entries" across the codes processed this run
echo "- Distribution:"
grep -F 'media entries' ./collect-logs.txt | cut -d' ' -f4- | sort | uniq -c

echo ""
echo "Full archive:"

echo -n "- Known URLs: "
cat all-urls.txt | wc -l

echo -n "- 404/missing: "
( ls errors.albums ; ls errors.media ) | wc -l

echo -n "- Saved images: "
ls images | wc -l

echo -n "- Saved metadata: "
( ls metadata.albums ; ls metadata.media ) | wc -l

8
irp2bolt/go.mod Normal file
View File

@ -0,0 +1,8 @@
module irp2bolt

go 1.19

// Imported directly by main.go, so it is not an indirect dependency.
require go.etcd.io/bbolt v1.3.7

require golang.org/x/sys v0.4.0 // indirect

4
irp2bolt/go.sum Normal file
View File

@ -0,0 +1,4 @@
go.etcd.io/bbolt v1.3.7 h1:j+zJOnnEjF/kyHlDDgGnVL/AIqIJPq8UoB2GSNfkUfQ=
go.etcd.io/bbolt v1.3.7/go.mod h1:N9Mkw9X8x5fupy0IKsmuqVtoGDyxsaDlbk4Rd05IAQw=
golang.org/x/sys v0.4.0 h1:Zr2JFtRQNX3BCZ8YtxRE9hNJYC8J6I1MVbMg6owUp18=
golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=

272
irp2bolt/main.go Normal file
View File

@ -0,0 +1,272 @@
package main
import (
"encoding/json"
"fmt"
"io/ioutil"
"log"
"os"
"time"
"go.etcd.io/bbolt"
)
// ImgurSingleMedia is one element of the `media` array in a saved imgur
// metadata document.
type ImgurSingleMedia struct {
	ID          string    `json:"id"`
	Title       string    `json:"title"`
	Description string    `json:"description"`
	MimeType    string    `json:"mime_type"`
	CreatedAt   time.Time `json:"created_at"` // e.g. 2013-10-28T02:37:02Z
	Width       int64     `json:"width"`
	Height      int64     `json:"height"`
	Extension   string    `json:"ext"`
}

// InventName builds a display filename for this media item.
//
// The base name is "Title - Description", or whichever of the two is
// non-empty, or the ID when both are empty. The extension is appended after a
// dot; when the extension is empty nothing is appended, so the result never
// ends in a dangling ".".
func (ism ImgurSingleMedia) InventName() string {
	name := ism.Title

	if ism.Description != "" {
		if name != "" {
			name += " - "
		}
		name += ism.Description
	}

	if name == "" {
		// This media item has neither a title nor a description
		// Guess we just name it after the ID
		name = ism.ID
	}

	if ism.Extension != "" {
		name += "." + ism.Extension
	}

	return name
}
// ImgurInfo is the top-level shape of a saved imgur metadata document, for
// either a single media post or an album.
type ImgurInfo struct {
	ID          string             `json:"id"`
	Title       string             `json:"title"`
	Description string             `json:"description"`
	CreatedAt   time.Time          `json:"created_at"`
	Media       []ImgurSingleMedia `json:"media"`
}

// AlbumJson returns the IDs of all media entries, in order, encoded as a JSON
// array of strings. It panics if encoding fails.
func (i ImgurInfo) AlbumJson() []byte {
	ids := make([]string, len(i.Media))
	for idx := range i.Media {
		ids[idx] = i.Media[idx].ID
	}

	encoded, err := json.Marshal(ids)
	if err != nil {
		panic(err)
	}
	return encoded
}

// InventName picks a display name for the post, trying in order:
// "Title - Description" (or whichever of the two is set), then the invented
// name of the first media entry, then the post ID.
func (i ImgurInfo) InventName() string {
	switch {
	case i.Title != "" && i.Description != "":
		// Title + description is pretty good for an album
		return i.Title + " - " + i.Description

	case i.Title != "":
		return i.Title

	case i.Description != "":
		return i.Description

	case len(i.Media) > 0:
		// Describe this album based on the first media instead
		return i.Media[0].InventName()

	default:
		// No name/description in either gallery nor in first image
		// Guess we just name it after the ID
		return i.ID
	}
}
// ContentedMetadata is the record that main JSON-encodes and stores in the
// METADATA bucket, one per media file and one per album.
// NOTE(review): presumably this mirrors the metadata struct used by contented
// itself — confirm the field names against that project.
type ContentedMetadata struct {
// Also used as the bucket key: a media ID, or "a/" + album ID for albums.
FileHash string
// Size in bytes of the image file, or of the album's JSON ID list.
FileSize int64
// Taken from the imgur created_at timestamp of the post.
UploadTime time.Time
// Always set to "n/a" by this tool.
UploadIP string
// Display name produced by one of the InventName methods.
Filename string
// The imgur mime_type for media, or "contented/album" for albums.
MimeType string
}
// main converts the metadata saved under ../metadata.media and
// ../metadata.albums into a new Bolt database named output-<unix time>.db in
// the current directory.
//
// Every media item whose image exists under ../images gets a
// ContentedMetadata record in the METADATA bucket, keyed by its ID. Every
// album additionally gets a record keyed "a/<album id>", and the JSON list of
// its media IDs is written to the file a/<album id>. Missing images are logged
// and skipped; any other failure panics, and the single write transaction is
// not committed.
func main() {
	db, err := bbolt.Open(fmt.Sprintf("output-%d.db", time.Now().Unix()), 0644, bbolt.DefaultOptions)
	if err != nil {
		panic(err)
	}
	// Close the database when main returns
	defer db.Close()

	err = db.Update(func(tx *bbolt.Tx) error {

		bb, err := tx.CreateBucketIfNotExists([]byte(`METADATA`))
		if err != nil {
			panic(err)
		}

		//
		// Media
		//

		media, err := os.ReadDir("../metadata.media")
		if err != nil {
			panic(err)
		}

		var addMediaCount int64 = 0

		for _, mediaInfo := range media {
			infoJson, err := ioutil.ReadFile("../metadata.media/" + mediaInfo.Name())
			if err != nil {
				panic(err)
			}

			var info ImgurInfo
			err = json.Unmarshal(infoJson, &info)
			if err != nil {
				panic(err)
			}

			if len(info.Media) != 1 {
				// err is nil at this point, so panic with a real explanation
				panic(fmt.Sprintf("media metadata %s has %d media entries, expected exactly 1", mediaInfo.Name(), len(info.Media)))
			}

			// Ensure image file exists
			finfo, err := os.Stat("../images/" + mediaInfo.Name())
			if err != nil {
				log.Printf("Missing image %s for media %s, skipping", mediaInfo.Name(), mediaInfo.Name())
				continue
				// panic(err)
			}

			cinfoBytes, err := json.Marshal(ContentedMetadata{
				FileHash:   mediaInfo.Name(),
				FileSize:   finfo.Size(),
				UploadTime: info.CreatedAt,
				UploadIP:   "n/a",
				Filename:   info.InventName(),
				MimeType:   info.Media[0].MimeType,
			})
			if err != nil {
				panic(err)
			}

			err = bb.Put([]byte(mediaInfo.Name()), cinfoBytes)
			if err != nil {
				panic(err)
			}

			addMediaCount += 1
		}

		log.Printf("Added %d media entries OK", addMediaCount)

		//
		// Albums
		//

		albums, err := os.ReadDir("../metadata.albums")
		if err != nil {
			panic(err)
		}

		// Album ID lists are written under a/, so make sure it exists up
		// front instead of failing on the first album
		err = os.MkdirAll("a", 0755)
		if err != nil {
			panic(err)
		}

		var addAlbumCount int64 = 0
		var addAlbumMediaCount int64 = 0
		var albumsWithNoImagesCount int64 = 0

		for _, albuminfo := range albums {
			infoJson, err := ioutil.ReadFile("../metadata.albums/" + albuminfo.Name())
			if err != nil {
				panic(err)
			}

			var info ImgurInfo
			err = json.Unmarshal(infoJson, &info)
			if err != nil {
				panic(err)
			}

			if len(info.Media) == 0 {
				log.Printf("Album '%s' contains no images, allowing anyway", albuminfo.Name())
				albumsWithNoImagesCount += 1
			}

			// Add gallery entries for each of the media elements
			for _, mediaInfo := range info.Media {

				// Ensure image file exists
				finfo, err := os.Stat("../images/" + mediaInfo.ID)
				if err != nil {
					log.Printf("Missing image %s for album %s, skipping", mediaInfo.ID, albuminfo.Name())
					continue
					// panic(err)
				}

				cinfoBytes, err := json.Marshal(ContentedMetadata{
					FileHash:   mediaInfo.ID,
					FileSize:   finfo.Size(),
					UploadTime: info.CreatedAt,
					UploadIP:   "n/a",
					Filename:   mediaInfo.InventName(),
					MimeType:   mediaInfo.MimeType,
				})
				if err != nil {
					panic(err)
				}

				err = bb.Put([]byte(mediaInfo.ID), cinfoBytes)
				if err != nil {
					panic(err)
				}

				addAlbumMediaCount += 1
			}

			// Add album entry for the overall album
			albumHash := `a/` + albuminfo.Name() // Use a/ prefix. This can't naturally happen in contented's filehash algorithm
			albumJson := info.AlbumJson()

			err = ioutil.WriteFile(albumHash, albumJson, 0644)
			if err != nil {
				panic(err)
			}

			cinfoBytes, err := json.Marshal(ContentedMetadata{
				FileHash:   albumHash,
				FileSize:   int64(len(albumJson)),
				UploadTime: info.CreatedAt,
				UploadIP:   "n/a",
				Filename:   info.InventName(),
				MimeType:   "contented/album",
			})
			if err != nil {
				panic(err)
			}

			err = bb.Put([]byte(albumHash), cinfoBytes)
			if err != nil {
				panic(err)
			}

			addAlbumCount += 1
		}

		// Report the per-album image counter here, not the single-media counter
		log.Printf("Added %d album entries OK with %d additional image entries", addAlbumCount, addAlbumMediaCount)
		log.Printf("There are %d albums with no images", albumsWithNoImagesCount)

		// Fully imported
		return nil
	})
	if err != nil {
		panic(err)
	}
}

View File

@ -0,0 +1,32 @@
#!/usr/bin/php
<?php
// Extract imgur links from every article body in wiki.db (SQLite, current
// directory) and print them to stdout, one per line. Duplicates are not
// removed.

$db = new \PDO("sqlite:wiki.db");

$matches = [];
$links = [];

foreach($db->query('SELECT id, body FROM articles') as $article) {
    // Article bodies are stored as raw deflate streams
    $body = gzinflate($article['body']);
    if ($body === false) {
        fwrite(STDERR, "WARNING: could not decompress article {$article['id']}, skipping\n");
        continue;
    }

    // [imgur]...[ tags: the tag content is taken as the path on i.imgur.com
    preg_match_all('~\[imgur\](.+?)\[~', $body, $matches);
    foreach(($matches[1] ?? []) as $short) {
        $links[] = 'https://i.imgur.com/'.$short;
    }

    // Inline links. The part between "://" and "imgur.com" is optional so
    // that bare https://imgur.com/... links are found as well as
    // subdomain ones; the dot is escaped so only a literal "imgur.com" matches.
    preg_match_all('~https?://[^ \t\n"><\]\[]*imgur\.com[^ \t\n"><\]\[]*~', $body, $matches);
    foreach(($matches[0] ?? []) as $link) {
        $links[] = $link;
    }
}

// Output
foreach($links as $link) {
    // A tag that held a full http://i.imgur.com URL was prefixed above; undo the doubled host
    echo str_replace("https://i.imgur.com/http://i.imgur.com", "https://i.imgur.com", $link)."\n";
}