imgur-rescue-project/collect/collect.php

159 lines
3.9 KiB
PHP
Executable File

#!/usr/bin/php
<?php
// Report every category of PHP error, warning and notice while the script runs.
error_reporting(E_ALL);
// Value sent as the client_id query parameter of the API request built in imgur_mediainfo().
define('IMGUR_CLIENT_ID', '546c25a59c58ad7'); // extract from any web interface pageview
/**
 * Tell whether a binary payload begins with the bytes "\x89PNG".
 *
 * Only the first four bytes are inspected; the rest of the payload is not validated.
 *
 * @param string $data raw file contents
 * @return bool true when the payload starts with "\x89PNG"
 */
function is_png($data) {
    return str_starts_with($data, "\x89PNG");
}
/**
 * Heuristically tell whether a binary payload is a JPEG.
 *
 * The payload is accepted when it starts with the bytes FF D8 FF, or, failing
 * that, when the text "JFIF" occurs anywhere inside its first 64 bytes.
 *
 * @param string $data raw file contents
 * @return bool true when either heuristic matches
 */
function is_jpg($data) {
    $startsWithMarker = str_starts_with($data, "\xFF\xD8\xFF");
    return $startsWithMarker || str_contains(substr($data, 0, 64), "JFIF");
}
/**
 * Tell whether a binary payload begins with the text "GIF8".
 *
 * Only the first four bytes are inspected; the rest of the payload is not validated.
 *
 * @param string $data raw file contents
 * @return bool true when the payload starts with "GIF8"
 */
function is_gif($data) {
    return str_starts_with($data, "GIF8");
}
/**
 * Heuristically tell whether a binary payload is an MP4 file.
 *
 * The payload is accepted when the text "isom" occurs anywhere inside its
 * first 64 bytes; nothing beyond that window is inspected.
 *
 * @param string $data raw file contents
 * @return bool true when "isom" is found in the leading 64 bytes
 */
function is_mp4($data) {
    $header = substr($data, 0, 64);
    return str_contains($header, "isom");
}
/**
 * Look up the metadata record of one imgur post through the post/v1 API.
 *
 * @param string $code ID of the media item or album to look up
 * @param string $type 'media' or 'albums'
 * @return array the decoded JSON object returned by the API
 * @throws Exception when the request fails, when the body is not a JSON object,
 *                   when the response carries an 'errors' key, or when the
 *                   'is_album' flag disagrees with the requested $type
 */
function imgur_mediainfo(string $code, string $type) {
    // n.b. 'posts' is often a synonym for 'media' with the same result
    // There is also a /meta endpoint possible after {$code}
    $ret = file_get_contents("https://api.imgur.com/post/v1/{$type}/{$code}?client_id=" . IMGUR_CLIENT_ID . "&include=media");
    if ($ret === false) {
        throw new Exception("Failed API lookup for {$code} as {$type}");
    }
    // str_starts_with() rather than $ret[0]: an empty body must be rejected
    // without raising an "Uninitialized string offset" warning.
    if (! str_starts_with($ret, '{')) {
        throw new Exception("API result for {$code} as {$type} got unexpected body: {$ret}");
    }
    $obj = json_decode($ret, true);
    // json_decode() yields null for malformed JSON; report that as an
    // unexpected body instead of falling through to the type check below.
    if (! is_array($obj) || isset($obj['errors'])) {
        throw new Exception("API result for {$code} as {$type} got unexpected body: {$ret}");
    }
    // A missing 'is_album' key compares as null, which never equals a bool,
    // so it is reported as a mismatch as well.
    if (($type === 'albums') !== ($obj['is_album'] ?? null)) {
        throw new Exception("Unexpected type mismatch from API");
    }
    return $obj;
}
/**
 * Fetch the metadata of one imgur post and download every media file it lists.
 *
 * Files used, all relative to the current working directory:
 *  - "errors.{$type}/{$code}"   written when the metadata lookup fails; its
 *                               presence makes later calls skip the code
 *  - "metadata.{$type}/{$code}" JSON cache of the metadata; reused when present
 *  - "images/{id}"              one file per downloaded media entry
 *  - "images/{id}.INVALID-NOT-AN-IMAGE" payloads not recognised as png/jpg/gif/mp4
 *
 * Progress and warnings are echoed to standard output.
 *
 * @param string $code ID of the media item or album
 * @param string $type 'media' or 'albums'
 * @return bool false when the code is a known error, the metadata lookup
 *              failed, or the cached metadata cannot be decoded; true
 *              otherwise, even if individual media downloads failed
 */
function imgur_download_single(string $code, string $type) {
    echo "Downloading {$code}...\n";

    $errorPath = "errors.{$type}/".$code;
    $metadataPath = "metadata.{$type}/".$code;

    if (file_exists($errorPath)) {
        echo "skipping (known error)\n";
        return false;
    }

    if (! file_exists($metadataPath)) {
        try {
            $metadata = imgur_mediainfo($code, $type);
        } catch (Exception $ex) {
            echo "WARNING: metadata download failed\n";
            echo (string)$ex . "\n";
            // Remember the failure so the same code is not retried on the next run.
            file_put_contents($errorPath, json_encode([
                "message" => $ex->getMessage(),
                "failure_time" => date(DATE_RSS),
            ]));
            return false;
        }
        file_put_contents($metadataPath, json_encode($metadata));
    } else {
        $cached = file_get_contents($metadataPath);
        $metadata = ($cached === false) ? null : json_decode($cached, true);
        // A truncated or corrupt cache file must not be mistaken for an
        // album without images; report it and let the caller see a failure.
        if (! is_array($metadata)) {
            echo "WARNING: cached metadata is unreadable or not valid JSON\n";
            return false;
        }
    }

    echo "code {$code} (type={$type}) contains ".$metadata['image_count']." media entries\n";
    if ($metadata['image_count'] == 0) {
        echo "WARNING: weird album with no images!\n";
        return true;
    }

    foreach ($metadata['media'] as $single) {
        echo "- entry ".$single['id']."\n";
        $imagePath = "images/".$single['id'];
        if (file_exists($imagePath)) {
            echo "already exists (OK)\n";
            continue;
        }

        // Download whole URL
        $ret = file_get_contents($single['url']);
        if ($ret === false) {
            echo "download failed (WARNING)\n";
            continue;
        }

        if (! is_png($ret) && ! is_jpg($ret) && ! is_gif($ret) && ! is_mp4($ret)) {
            echo "unexpected result, not jpg/gif/png/mp4 (WARNING)\n";
            // Keep the payload under a distinct name so it can be inspected later.
            file_put_contents($imagePath.".INVALID-NOT-AN-IMAGE", $ret);
            continue;
        }

        // A 503-byte payload with this exact digest is treated as the
        // placeholder served for a missing image; it is not stored.
        if (strlen($ret) === 503 && md5($ret) === "d835884373f4d6c8f24742ceabe74946") {
            echo "fake image result for image not found (WARNING)\n";
            continue;
        }

        file_put_contents($imagePath, $ret);
    }

    // all done
    return true;
}
/**
 * Script driver: create the working directories, then process every URL
 * listed (one per line) in "all-urls.txt" in the current working directory.
 *
 * URL routing:
 *  - an ID directly after the host (several IDs may be comma-separated) is
 *    downloaded entry by entry as type 'media';
 *  - an ID behind an "a/", "gallery/" or "r|t|topic/<name>/" prefix is
 *    downloaded as type 'albums';
 *  - anything else is reported as unsupported.
 *
 * Surrounding whitespace (including a "\r" left by CRLF line endings) is
 * stripped from each line and blank lines are ignored.
 *
 * @throws RuntimeException when all-urls.txt cannot be read
 */
function main() {
    foreach (['images', 'metadata.albums', 'metadata.media', 'errors.albums', 'errors.media'] as $dirname) {
        if (! is_dir($dirname)) {
            mkdir($dirname);
        }
    }

    $lines = file("all-urls.txt", FILE_IGNORE_NEW_LINES);
    if ($lines === false) {
        throw new RuntimeException("Unable to read all-urls.txt");
    }

    foreach ($lines as $line) {
        // The patterns below are anchored with $, so a stray "\r" or space
        // would otherwise make a valid URL look unsupported.
        $url = trim($line);
        if ($url === '') {
            continue;
        }

        $matches = [];
        if (preg_match("~^https://(i\\.|m\\.|img\\.|www\.)?imgur\\.com/([0-9a-zA-Z,]+)\\.?(jpg|jpeg|gif|webm|png|gifv|mp4)?(#.+)?$~", $url, $matches)) {
            foreach (explode(',', $matches[2]) as $single_id) {
                imgur_download_single($single_id, 'media');
            }
        } else if (preg_match("~^https://(i\\.|m\\.|img\\.|www\.)?imgur\\.com/(a/|gallery/|(?:r|t|topic)/[a-zA-Z0-9_]+/)?([0-9a-zA-Z,]+)\\.?(jpg|jpeg|gif|webm|png|gifv|mp4)?(#.+)?$~", $url, $matches)) {
            // Prefix-less URLs were already consumed by the first pattern, so
            // only prefixed URLs reach this branch.
            imgur_download_single($matches[3], 'albums');
        } else {
            echo "WARNING: Unsupported URL: {$url}\n";
        }
    }
}
// Script entry point: run the collector as soon as the file is executed.
main();