159 lines
3.9 KiB
PHP
159 lines
3.9 KiB
PHP
|
#!/usr/bin/php
|
||
|
<?php
|
||
|
|
||
|
// Report every class of PHP error, warning and notice while the script runs.
error_reporting(E_ALL);
|
||
|
// Client ID sent as the `client_id` query parameter of the API lookup request.
define('IMGUR_CLIENT_ID', '546c25a59c58ad7'); // extract from any web interface pageview
|
||
|
|
||
|
/**
 * Tell whether the given bytes begin with the 4-byte PNG signature prefix.
 *
 * @param string $data Raw file contents
 * @return bool true when $data starts with "\x89PNG"
 */
function is_png($data) {
    $pngMagic = "\x89PNG";

    return str_starts_with($data, $pngMagic);
}
|
||
|
|
||
|
/**
 * Tell whether the given bytes look like a JPEG file.
 *
 * Accepts data that begins with the bytes FF D8 FF, or that carries the
 * text "JFIF" anywhere within its first 64 bytes.
 *
 * @param string $data Raw file contents
 * @return bool true when either check matches
 */
function is_jpg($data) {
    $startsWithMagic = str_starts_with($data, "\xFF\xD8\xFF");

    // Fallback: look for the JFIF tag near the start of the data.
    return $startsWithMagic || str_contains(substr($data, 0, 64), "JFIF");
}
|
||
|
|
||
|
/**
 * Tell whether the given bytes begin with "GIF8".
 *
 * @param string $data Raw file contents
 * @return bool true when $data starts with "GIF8"
 */
function is_gif($data) {
    $gifMagic = "GIF8";

    return str_starts_with($data, $gifMagic);
}
|
||
|
|
||
|
/**
 * Tell whether the given bytes look like an MP4 file.
 *
 * The check is a simple search for the text "isom" within the first 64 bytes.
 *
 * @param string $data Raw file contents
 * @return bool true when "isom" occurs in the first 64 bytes
 */
function is_mp4($data) {
    $header = substr($data, 0, 64);

    return str_contains($header, "isom");
}
|
||
|
|
||
|
/**
 * Fetch and decode the API metadata for one Imgur code.
 *
 * Requests https://api.imgur.com/post/v1/{type}/{code} with the configured
 * client ID and `include=media`, then sanity-checks the response before
 * returning it.
 *
 * @param string $code Imgur identifier to look up
 * @param string $type 'media' or 'albums'
 * @return array Decoded JSON response as an associative array
 * @throws Exception If the request fails, the body is not a JSON object,
 *                   the API reports errors, or the 'is_album' flag does not
 *                   agree with $type
 */
function imgur_mediainfo(string $code, string $type): array {

    // n.b. 'posts' is often a synonym for 'media' with the same result
    // There is also a /meta endpoint possible after {$code}

    $ret = file_get_contents("https://api.imgur.com/post/v1/{$type}/{$code}?client_id=" . IMGUR_CLIENT_ID . "&include=media");
    if ($ret === false) {
        throw new Exception("Failed API lookup for {$code} as {$type}");
    }

    // str_starts_with() also copes with an empty body, where $ret[0] would
    // raise an "Uninitialized string offset" warning.
    if (! str_starts_with($ret, '{')) {
        throw new Exception("API result for {$code} as {$type} got unexpected body: {$ret}");
    }

    $obj = json_decode($ret, true);

    // A body that starts with '{' can still be truncated or malformed JSON, in
    // which case json_decode() returns null; report that as an unexpected
    // body instead of falling through to the type check below.
    if (! is_array($obj) || isset($obj['errors'])) {
        throw new Exception("API result for {$code} as {$type} got unexpected body: {$ret}");
    }

    // The API must agree on whether this code is an album; a missing flag is
    // treated as a mismatch as well.
    if (! isset($obj['is_album']) || ($type === 'albums') !== $obj['is_album']) {
        throw new Exception("Unexpected type mismatch from API");
    }

    return $obj;
}
|
||
|
|
||
|
/**
 * Download every media file belonging to one Imgur code.
 *
 * Metadata is cached in "metadata.{type}/{code}"; a failed metadata lookup is
 * recorded in "errors.{type}/{code}" so the code is skipped on later runs.
 * Media files are stored as "images/{id}" and are not downloaded again once
 * that file exists. Progress and warnings are echoed to standard output.
 *
 * @param string $code Imgur identifier
 * @param string $type 'media' or 'albums' (selects the API endpoint and the
 *                     cache/error directories)
 * @return bool false when the code is a known error or its metadata could not
 *              be fetched; true otherwise, even if individual files failed
 */
function imgur_download_single(string $code, string $type): bool {
    echo "Downloading {$code}...\n";

    $errorPath = "errors.{$type}/" . $code;
    $metadataPath = "metadata.{$type}/" . $code;

    if (file_exists($errorPath)) {
        echo "skipping (known error)\n";
        return false;
    }

    // Try the on-disk cache first.
    $metadata = null;
    if (file_exists($metadataPath)) {
        $cached = file_get_contents($metadataPath);
        $metadata = ($cached === false) ? null : json_decode($cached, true);

        if (! is_array($metadata)) {
            // A truncated or corrupt cache file must not be mistaken for an
            // album with no images; fetch the metadata again instead.
            echo "WARNING: cached metadata is unreadable, downloading it again\n";
            $metadata = null;
        }
    }

    if ($metadata === null) {
        try {
            $metadata = imgur_mediainfo($code, $type);

        } catch (Exception $ex) {
            echo "WARNING: metadata download failed\n";
            echo (string)$ex . "\n";

            // Remember the failure so later runs skip this code.
            file_put_contents($errorPath, json_encode([
                "message" => $ex->getMessage(),
                "failure_time" => date(DATE_RSS),
            ]));

            return false;
        }

        file_put_contents($metadataPath, json_encode($metadata));
    }

    // Tolerate metadata that lacks the expected keys instead of raising warnings.
    $media = is_array($metadata['media'] ?? null) ? $metadata['media'] : [];
    $imageCount = (int) ($metadata['image_count'] ?? count($media));

    echo "code {$code} (type={$type}) contains ".$imageCount." media entries\n";
    if ($imageCount === 0) {
        echo "WARNING: weird album with no images!\n";
        return true;
    }

    foreach ($media as $single) {

        if (! isset($single['id'], $single['url'])) {
            // Without an id the target path would be the "images/" directory
            // itself, which would be reported as an existing download.
            echo "- malformed media entry without id/url (WARNING)\n";
            continue;
        }

        echo "- entry ".$single['id']."\n";

        $target = "images/".$single['id'];
        if (file_exists($target)) {
            echo "already exists (OK)\n";
            continue;
        }

        // Download whole URL
        $ret = file_get_contents($single['url']);
        if ($ret === false) {
            echo "download failed (WARNING)\n";
            continue;
        }

        if (! is_png($ret) && ! is_jpg($ret) && ! is_gif($ret) && ! is_mp4($ret)) {
            echo "unexpected result, not jpg/gif/png/mp4 (WARNING)\n";
            // Keep the unexpected payload next to the real images for inspection.
            file_put_contents($target.".INVALID-NOT-AN-IMAGE", $ret);
            continue;
        }

        // A specific 503-byte payload is served in place of a missing image;
        // recognise it by length and checksum and do not store it.
        if (strlen($ret) === 503 && md5($ret) === "d835884373f4d6c8f24742ceabe74946") {
            echo "fake image result for image not found (WARNING)\n";
            continue;
        }

        if (file_put_contents($target, $ret) === false) {
            echo "saving failed (WARNING)\n";
        }
    }

    // all done
    return true;
}
|
||
|
|
||
|
/**
 * Script entry point: download everything listed in "all-urls.txt".
 *
 * Creates the working directories when they are missing, then reads one URL
 * per line. Direct media URLs (optionally several comma-separated ids) are
 * downloaded as 'media'; URLs with an a/, gallery/, r/…, t/… or topic/…
 * prefix are downloaded as 'albums'. Anything else is reported as unsupported.
 *
 * @return void
 */
function main(): void {

    foreach (['images', 'metadata.albums', 'metadata.media', 'errors.albums', 'errors.media'] as $dirname) {
        // The second is_dir() covers the directory appearing between the check and mkdir().
        if (! is_dir($dirname) && ! mkdir($dirname) && ! is_dir($dirname)) {
            echo "ERROR: could not create directory {$dirname}\n";
            return;
        }
    }

    $list = file_get_contents("all-urls.txt");
    if ($list === false) {
        echo "ERROR: could not read all-urls.txt\n";
        return;
    }

    // Direct link: optional subdomain, id list, optional extension and fragment.
    $mediaPattern = '~^https://(i\.|m\.|img\.|www\.)?imgur\.com/([0-9a-zA-Z,]+)\.?(jpg|jpeg|gif|webm|png|gifv|mp4)?(#.+)?$~';
    // Same shape with an optional a/, gallery/, r/<name>/, t/<name>/ or topic/<name>/ prefix.
    $albumPattern = '~^https://(i\.|m\.|img\.|www\.)?imgur\.com/(a/|gallery/|(?:r|t|topic)/[a-zA-Z0-9_]+/)?([0-9a-zA-Z,]+)\.?(jpg|jpeg|gif|webm|png|gifv|mp4)?(#.+)?$~';

    foreach (explode("\n", $list) as $line) {

        // Tolerate CRLF line endings and stray whitespace, and skip blank
        // lines (including the empty element after a trailing newline).
        $url = trim($line);
        if ($url === '') {
            continue;
        }

        $matches = [];

        if (preg_match($mediaPattern, $url, $matches)) {

            foreach (explode(',', $matches[2]) as $single_id) {
                // "a,,b" or a trailing comma yields an empty id; nothing to fetch for it.
                if ($single_id === '') {
                    continue;
                }
                imgur_download_single($single_id, 'media');
            }

        } elseif (preg_match($albumPattern, $url, $matches)) {

            imgur_download_single($matches[3], 'albums');

        } else {
            echo "WARNING: Unsupported URL: {$url}\n";

        }

    }

}
|
||
|
|
||
|
// Run the downloader when the script is executed.
main();
|