initial commit
This commit is contained in:
commit
959fca404e
12
README.md
Normal file
12
README.md
Normal file
@ -0,0 +1,12 @@
|
||||
# Imgur Rescue Project
|
||||
|
||||
In April 2023 the website imgur.com [announced](https://help.imgur.com/hc/en-us/articles/14415587638029/) it may delete old data/images that are rarely accessed and were uploaded without an account.
|
||||
|
||||
This repository contains public versions of internal scripts used to extract imgur.com links from `yatwiki` and `archive`; download the images and metadata; and convert them to a `contented` database for ongoing read-only hosting.
|
||||
|
||||
The following tools are available:
|
||||
|
||||
- `archive-scrape`: extract imgur links from an [`archive`](https://code.ivysaur.me/archive) installation.
|
||||
- `yatwiki-scrape`: extract imgur links from a [`yatwiki`](https://code.ivysaur.me/yatwiki) installation.
|
||||
- `collect`: download imgur links and metadata, recursively following albums, with soft caching.
|
||||
- `irp2bolt`: convert the resulting imgur links and metadata into a Bolt database for use with [`contented`](https://code.ivysaur.me/contented).
|
5
archive-scrape/scrape.sh
Executable file
5
archive-scrape/scrape.sh
Executable file
@ -0,0 +1,5 @@
|
||||
#!/bin/bash

# Extract every imgur.com URL found anywhere under the current directory
# (an `archive` installation) into archive-urls.txt, one URL per line.

set -eu

# LC_ALL=C    : byte-wise matching, avoids grep choking on non-UTF-8 files.
# -o -h       : print only the matched URL, without filename prefixes.
# tr -d $'\r' : strip carriage returns left over from CRLF source files.
# sed         : normalise http:// to https:// so de-duplication merges both forms.
# sort -u     : sort and de-duplicate in one step (replaces `sort | uniq`).
LC_ALL=C grep -RP 'http[^ ]+imgur\.com[^\b ]*' . -o -h | tr -d $'\r' | sed 's/http:/https:/' | sort -u > archive-urls.txt
|
158
collect/collect.php
Executable file
158
collect/collect.php
Executable file
@ -0,0 +1,158 @@
|
||||
#!/usr/bin/php
|
||||
<?php
|
||||
|
||||
error_reporting(E_ALL);
|
||||
define('IMGUR_CLIENT_ID', '546c25a59c58ad7'); // extract from any web interface pageview
|
||||
|
||||
// True when $data begins with the 4-byte PNG signature prefix ("\x89PNG").
function is_png($data) {
	$magic = "\x89PNG";
	return strncmp($data, $magic, 4) === 0;
}
|
||||
|
||||
// Heuristic JPEG detection: either the SOI marker prefix FF D8 FF, or a
// "JFIF" tag somewhere within the first 64 bytes of the payload.
function is_jpg($data) {
	if (strncmp($data, "\xFF\xD8\xFF", 3) === 0) {
		return true;
	}

	$head = substr($data, 0, 64);
	return str_contains($head, "JFIF");
}
|
||||
|
||||
// True when $data begins with "GIF8" (covers both GIF87a and GIF89a headers).
function is_gif($data) {
	return strncmp($data, "GIF8", 4) === 0;
}
|
||||
|
||||
// Loose MP4 detection: looks for the "isom" brand string within the first
// 64 bytes (typically part of the leading ftyp header in imgur's MP4s).
function is_mp4($data) {
	$prefix = substr($data, 0, 64);
	return str_contains($prefix, "isom");
}
|
||||
|
||||
/**
 * Fetch and decode metadata for a single imgur post from the public API.
 *
 * @param $code imgur short code
 * @param $type 'media' or 'albums'
 * @return array decoded API response
 * @throws Exception on network failure or an unexpected/error response
 */
function imgur_mediainfo(string $code, string $type) {

	// n.b. 'posts' is often a synonym for 'media' with the same result
	// There is also a /meta endpoint possible after {$code}

	$url = "https://api.imgur.com/post/v1/{$type}/{$code}?client_id=" . IMGUR_CLIENT_ID . "&include=media";
	$body = file_get_contents($url);
	if ($body === false) {
		throw new Exception("Failed API lookup for {$code} as {$type}");
	}

	// Anything that isn't a JSON object (e.g. an HTML error page) is unexpected.
	if ($body[0] !== '{') {
		throw new Exception("API result for {$code} as {$type} got unexpected body: {$body}");
	}

	$decoded = json_decode($body, true);
	if (isset($decoded['errors'])) {
		throw new Exception("API result for {$code} as {$type} got unexpected body: {$body}");
	}

	// Sanity check: the API's own is_album flag must agree with the
	// endpoint type we requested.
	if ( ($type === 'albums') !== $decoded['is_album'] ) {
		throw new Exception("Unexpected type mismatch from API");
	}

	return $decoded;
}
|
||||
|
||||
/**
 * Download a single imgur item (or album) plus its metadata, with soft caching.
 *
 * On-disk layout (relative to the working directory):
 *   metadata.{$type}/{$code}  - cached JSON metadata from the API
 *   errors.{$type}/{$code}    - marker recording a previously-failed lookup
 *   images/{id}               - raw downloaded media bytes
 *
 * @param $code imgur short code
 * @param $type 'media' or 'albums'
 * @return bool true when processed (including empty albums and per-image
 *              download failures), false when metadata could not be obtained
 */
function imgur_download_single(string $code, string $type) {
	echo "Downloading {$code}...\n";

	// A previous run recorded a permanent failure for this code; skip it
	// rather than hammering the API again.
	if (file_exists("errors.{$type}/".$code)) {
		echo "skipping (known error)\n";
		return false;
	}

	// Metadata is soft-cached on disk: only hit the API when not present.
	if (! file_exists("metadata.{$type}/".$code)) {
		try {
			$metadata = imgur_mediainfo($code, $type);

		} catch (Exception $ex) {
			echo "WARNING: metadata download failed\n";
			echo (string)$ex . "\n";

			// Record the failure so future runs skip this code immediately.
			file_put_contents("errors.{$type}/".$code, json_encode([
				"message" => $ex->getMessage(),
				"failure_time" => date(DATE_RSS),
			]));

			return false;
		}

		file_put_contents("metadata.{$type}/".$code, json_encode($metadata));

	} else {
		$metadata = json_decode(file_get_contents("metadata.{$type}/".$code), true);
	}

	echo "code {$code} (type={$type}) contains ".$metadata['image_count']." media entries\n";
	if ($metadata['image_count'] == 0) {
		echo "WARNING: weird album with no images!\n";
		return true;
	}

	// Download each media element in the post (an album has many; a single
	// media post has one).
	foreach($metadata['media'] as $single) {

		echo "- entry ".$single['id']."\n";

		// Image bytes are cached too: skip anything already downloaded.
		if (file_exists("images/".$single['id'])) {
			echo "already exists (OK)\n";
			continue;
		}

		// Download whole URL
		$ret = file_get_contents($single['url']);
		if ($ret === false) {
			echo "download failed (WARNING)\n";
			continue;
		}

		// Sanity-check the payload really is a supported media format (and
		// not e.g. an HTML error page); keep the bytes for inspection anyway.
		if (! is_png($ret) && ! is_jpg($ret) && ! is_gif($ret) && ! is_mp4($ret)) {
			echo "unexpected result, not jpg/gif/png/mp4 (WARNING)\n";
			file_put_contents("images/".$single['id'].".INVALID-NOT-AN-IMAGE", $ret);
			continue;
		}

		// Detect (by exact size + md5) a specific placeholder image that is
		// served in place of removed content, and treat it as missing.
		if (strlen($ret) === 503 && md5($ret) == "d835884373f4d6c8f24742ceabe74946") {
			echo "fake image result for image not found (WARNING)\n";
			continue;
		}

		file_put_contents("images/".$single['id'], $ret);
	}

	// all done
	return true;
}
|
||||
|
||||
/**
 * Entry point: create the cache directories, then read all-urls.txt and
 * download every recognised imgur URL (single media or albums/galleries).
 */
function main() {

	// Create the on-disk cache/output directories on first run.
	foreach(['images', 'metadata.albums', 'metadata.media', 'errors.albums', 'errors.media'] as $dirname) {
		if (! is_dir($dirname)) {
			mkdir($dirname);
		}
	}

	$urls = explode("\n", file_get_contents("all-urls.txt"));
	$matches = [];

	foreach($urls as $url) {

		// Direct media URL: optional subdomain, one or more IDs (the regex
		// permits comma-separated IDs, downloaded individually below),
		// optional extension and fragment.
		if (preg_match("~^https://(i\\.|m\\.|img\\.|www\.)?imgur\\.com/([0-9a-zA-Z,]+)\\.?(jpg|jpeg|gif|webm|png|gifv|mp4)?(#.+)?$~", $url, $matches)) {

			foreach(explode(',', $matches[2]) as $single_id) {
				imgur_download_single($single_id, 'media');
			}

		// Album / gallery / topic URL: download via the albums endpoint,
		// which pulls in each member image.
		} else if (preg_match("~^https://(i\\.|m\\.|img\\.|www\.)?imgur\\.com/(a/|gallery/|(?:r|t|topic)/[a-zA-Z0-9_]+/)?([0-9a-zA-Z,]+)\\.?(jpg|jpeg|gif|webm|png|gifv|mp4)?(#.+)?$~", $url, $matches)) {

			imgur_download_single($matches[3], 'albums');

		} else {
			echo "WARNING: Unsupported URL: {$url}\n";

		}

	}

}

main();
|
32
collect/stats.sh
Executable file
32
collect/stats.sh
Executable file
@ -0,0 +1,32 @@
|
||||
#!/bin/bash

# Summarise collect.php progress: per-run counts from collect-logs.txt plus
# totals from the on-disk archive directories.
# Note: `fgrep` is deprecated in GNU grep; use `grep -F` (fixed strings).

echo "Current run:"

echo -n "- Downloading: "
grep -F 'Downloading' ./collect-logs.txt | wc -l

echo -n "- Unsupported: "
grep -F 'Unsupported' ./collect-logs.txt | wc -l

echo -n "- 404/missing: "
grep -F 'Failed API lookup' ./collect-logs.txt | wc -l
#grep -F '404 Not Found' ./collect-logs.txt | wc -l
#grep -F 'fake image result for image not found' ./collect-logs.txt | wc -l

# Histogram of "contains N media entries" log lines, grouped by count.
echo "- Distribution:"
grep -F 'media entries' collect-logs.txt | cut -d' ' -f4- | sort | uniq -c

echo ""
echo "Full archive:"

echo -n "- Known URLs: "
wc -l < all-urls.txt

echo -n "- 404/missing: "
( ls errors.albums ; ls errors.media ) | wc -l

echo -n "- Saved images: "
ls images | wc -l

echo -n "- Saved metadata: "
( ls metadata.albums ; ls metadata.media ) | wc -l
|
8
irp2bolt/go.mod
Normal file
8
irp2bolt/go.mod
Normal file
@ -0,0 +1,8 @@
|
||||
module irp2bolt

go 1.19

// bbolt is imported directly by main.go, so it is a direct dependency
// (it was previously mis-marked "// indirect").
require go.etcd.io/bbolt v1.3.7

// x/sys is pulled in transitively by bbolt.
require golang.org/x/sys v0.4.0 // indirect
|
4
irp2bolt/go.sum
Normal file
4
irp2bolt/go.sum
Normal file
@ -0,0 +1,4 @@
|
||||
go.etcd.io/bbolt v1.3.7 h1:j+zJOnnEjF/kyHlDDgGnVL/AIqIJPq8UoB2GSNfkUfQ=
|
||||
go.etcd.io/bbolt v1.3.7/go.mod h1:N9Mkw9X8x5fupy0IKsmuqVtoGDyxsaDlbk4Rd05IAQw=
|
||||
golang.org/x/sys v0.4.0 h1:Zr2JFtRQNX3BCZ8YtxRE9hNJYC8J6I1MVbMg6owUp18=
|
||||
golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
272
irp2bolt/main.go
Normal file
272
irp2bolt/main.go
Normal file
@ -0,0 +1,272 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"go.etcd.io/bbolt"
|
||||
)
|
||||
|
||||
// ImgurSingleMedia is one element of the imgur API's "media" array,
// holding only the fields this converter needs.
type ImgurSingleMedia struct {
	ID          string    `json:"id"`
	Title       string    `json:"title"`
	Description string    `json:"description"`
	MimeType    string    `json:"mime_type"`
	CreatedAt   time.Time `json:"created_at"` // e.g. 2013-10-28T02:37:02Z

	Width     int64  `json:"width"`
	Height    int64  `json:"height"`
	Extension string `json:"ext"` // file extension without the leading dot
}
|
||||
|
||||
func (ism ImgurSingleMedia) InventName() string {
|
||||
ret := ism.Title
|
||||
if len(ism.Description) > 0 {
|
||||
if len(ret) > 0 {
|
||||
ret += " - "
|
||||
}
|
||||
ret += ism.Description
|
||||
}
|
||||
|
||||
if len(ret) == 0 {
|
||||
// No name/description in either gallery nor in first image
|
||||
// Guess we just name it after the ID
|
||||
ret = ism.ID
|
||||
}
|
||||
|
||||
ret += "." + ism.Extension
|
||||
|
||||
return ret
|
||||
}
|
||||
|
||||
// ImgurInfo is the top-level API metadata for one post: a single media item
// or an album (any number of Media entries).
type ImgurInfo struct {
	ID          string             `json:"id"`
	Title       string             `json:"title"`
	Description string             `json:"description"`
	CreatedAt   time.Time          `json:"created_at"`
	Media       []ImgurSingleMedia `json:"media"`
}
|
||||
|
||||
func (i ImgurInfo) AlbumJson() []byte {
|
||||
arr := make([]string, 0, len(i.Media))
|
||||
for _, m := range i.Media {
|
||||
arr = append(arr, m.ID)
|
||||
}
|
||||
|
||||
bb, err := json.Marshal(arr)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
return bb
|
||||
}
|
||||
|
||||
func (i ImgurInfo) InventName() string {
|
||||
ret := i.Title
|
||||
|
||||
if len(i.Description) > 0 {
|
||||
if len(ret) > 0 {
|
||||
ret += " - "
|
||||
}
|
||||
ret += i.Description
|
||||
}
|
||||
|
||||
if len(ret) > 0 {
|
||||
return ret // Title + description is pretty good for an album
|
||||
}
|
||||
|
||||
if len(i.Media) > 0 {
|
||||
// Describe this album based on the first media instead
|
||||
return i.Media[0].InventName()
|
||||
}
|
||||
|
||||
// No name/description in either gallery nor in first image
|
||||
// Guess we just name it after the ID
|
||||
return i.ID
|
||||
}
|
||||
|
||||
// ContentedMetadata mirrors the per-file metadata record stored in a
// contented Bolt database (keyed by FileHash in the METADATA bucket).
// Field names and order must match what contented expects when decoding.
type ContentedMetadata struct {
	FileHash   string
	FileSize   int64
	UploadTime time.Time
	UploadIP   string // not meaningful for imported data; set to "n/a"
	Filename   string
	MimeType   string
}
|
||||
|
||||
func main() {
|
||||
db, err := bbolt.Open(fmt.Sprintf("output-%d.db", time.Now().Unix()), 0644, bbolt.DefaultOptions)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
err = db.Update(func(tx *bbolt.Tx) error {
|
||||
|
||||
bb, err := tx.CreateBucketIfNotExists([]byte(`METADATA`))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
//
|
||||
// Media
|
||||
//
|
||||
|
||||
media, err := os.ReadDir("../metadata.media")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
var addMediaCount int64 = 0
|
||||
|
||||
for _, mediaInfo := range media {
|
||||
infoJson, err := ioutil.ReadFile("../metadata.media/" + mediaInfo.Name())
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
var info ImgurInfo
|
||||
err = json.Unmarshal(infoJson, &info)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
if len(info.Media) != 1 {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
// Ensure image file exists
|
||||
finfo, err := os.Stat("../images/" + mediaInfo.Name())
|
||||
if err != nil {
|
||||
log.Printf("Missing image %s for media %s, skipping", mediaInfo.Name(), mediaInfo.Name())
|
||||
continue
|
||||
// panic(err)
|
||||
}
|
||||
|
||||
cinfoBytes, err := json.Marshal(ContentedMetadata{
|
||||
FileHash: mediaInfo.Name(),
|
||||
FileSize: finfo.Size(),
|
||||
UploadTime: info.CreatedAt,
|
||||
UploadIP: "n/a",
|
||||
Filename: info.InventName(),
|
||||
MimeType: info.Media[0].MimeType,
|
||||
})
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
err = bb.Put([]byte(mediaInfo.Name()), cinfoBytes)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
addMediaCount += 1
|
||||
}
|
||||
|
||||
log.Printf("Added %d media entries OK", addMediaCount)
|
||||
|
||||
//
|
||||
// Albums
|
||||
//
|
||||
|
||||
albums, err := os.ReadDir("../metadata.albums")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
var addAlbumCount int64 = 0
|
||||
var addAlbumMediaCount int64 = 0
|
||||
var albumsWithNoImagesCount int64 = 0
|
||||
|
||||
for _, albuminfo := range albums {
|
||||
infoJson, err := ioutil.ReadFile("../metadata.albums/" + albuminfo.Name())
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
var info ImgurInfo
|
||||
err = json.Unmarshal(infoJson, &info)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
if len(info.Media) == 0 {
|
||||
log.Printf("Album '%s' contains no images, allowing anyway", albuminfo.Name())
|
||||
albumsWithNoImagesCount += 1
|
||||
}
|
||||
|
||||
// Add gallery entries for each of the media elements
|
||||
|
||||
for _, mediaInfo := range info.Media {
|
||||
|
||||
// Ensure image file exists
|
||||
finfo, err := os.Stat("../images/" + mediaInfo.ID)
|
||||
if err != nil {
|
||||
log.Printf("Missing image %s for album %s, skipping", mediaInfo.ID, albuminfo.Name())
|
||||
continue
|
||||
// panic(err)
|
||||
}
|
||||
|
||||
cinfoBytes, err := json.Marshal(ContentedMetadata{
|
||||
FileHash: mediaInfo.ID,
|
||||
FileSize: finfo.Size(),
|
||||
UploadTime: info.CreatedAt,
|
||||
UploadIP: "n/a",
|
||||
Filename: mediaInfo.InventName(),
|
||||
MimeType: mediaInfo.MimeType,
|
||||
})
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
err = bb.Put([]byte(mediaInfo.ID), cinfoBytes)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
addAlbumMediaCount += 1
|
||||
|
||||
}
|
||||
|
||||
// Add album entry for the overall album
|
||||
albumHash := `a/` + albuminfo.Name() // Use a/ prefix. This can't naturally happen in contented's filehash algorithm
|
||||
albumJson := info.AlbumJson()
|
||||
err = ioutil.WriteFile(albumHash, albumJson, 0644) // a/ subdirectory must exist
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
cinfoBytes, err := json.Marshal(ContentedMetadata{
|
||||
FileHash: albumHash,
|
||||
FileSize: int64(len(albumJson)),
|
||||
UploadTime: info.CreatedAt,
|
||||
UploadIP: "n/a",
|
||||
Filename: info.InventName(),
|
||||
MimeType: "contented/album",
|
||||
})
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
err = bb.Put([]byte(albumHash), cinfoBytes)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
addAlbumCount += 1
|
||||
}
|
||||
|
||||
log.Printf("Added %d album entries OK with %d additional image entries", addAlbumCount, addMediaCount)
|
||||
|
||||
log.Printf("There are %d albums with no images", albumsWithNoImagesCount)
|
||||
|
||||
// Fully imported
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
32
yatwiki-scrape/scrape-wikidb.php
Normal file
32
yatwiki-scrape/scrape-wikidb.php
Normal file
@ -0,0 +1,32 @@
|
||||
#!/usr/bin/php
<?php

// Extract imgur.com links from a yatwiki sqlite database (wiki.db).
// Article bodies are stored deflate-compressed; imgur links appear either as
// [imgur]CODE[ bbcode tags or as plain inline URLs.
// Prints one URL per line on stdout.

$db = new \PDO("sqlite:wiki.db");
$matches = [];

$links = [];

foreach($db->query('SELECT id, body FROM articles') as $article) {
	$body = gzinflate($article['body']);

	// Bbcode tags: [imgur]CODE[ — the captured code is the bare image ID.
	// Bug fix: previously gated on count($matches), which is always truthy
	// after preg_match_all; use the match count it returns instead.
	if (preg_match_all('~\[imgur\](.+?)\[~', $body, $matches)) {
		foreach($matches[1] as $short) {
			$links[] = 'https://i.imgur.com/'.$short;
		}
	}

	// Inline links. Bug fix: the dot in imgur\.com is now escaped — the
	// previous unescaped '.' matched any byte (e.g. "imgurXcom").
	if (preg_match_all('~https?://[^ \t\n"><\]\[]+imgur\.com[^ \t\n"><\]\[]*~', $body, $matches)) {
		foreach($matches[0] as $link) {
			$links[] = $link;
		}
	}
}

// Output
foreach($links as $link) {
	// A bbcode "short code" may itself be a full URL; collapse the doubled
	// prefix that would otherwise result.
	echo str_replace("https://i.imgur.com/http://i.imgur.com", "https://i.imgur.com", $link)."\n";
}
|
Loading…
Reference in New Issue
Block a user