myrunningmancom-scraper/pages-to-json.php

87 lines
2.2 KiB
PHP
Executable File

#!/usr/bin/php
<?php
// Report every class of PHP error while the script runs.
error_reporting(E_ALL);
/**
 * Extract the metadata of a single episode from its saved HTML page.
 *
 * Every field is located with a regular expression. A field whose pattern
 * does not match is returned as an empty string (an empty array for the
 * tags). For the title, stream link, broadcast date, filming date and
 * torrent link a warning is additionally written with error_log().
 *
 * @param string $html Raw HTML of one episode page.
 * @return array Map with the keys 'title', 'stream', 'tags' (tag name =>
 *               name of the user who added it), 'broadcast_date',
 *               'filming_date', 'location', 'description' and 'torrent'.
 */
function parse_episode($html) {
    $matches = [];

    // Title and episode number come from the <title> element. Both default
    // to '' so a page without a recognisable title no longer triggers
    // undefined-index warnings further down.
    $title = '';
    $episode_num = '';
    if (preg_match('~<title>Episode #([0-9]{3}) - (.+)</title>~', $html, $matches)) {
        $episode_num = $matches[1];
        $title = html_entity_decode($matches[2]);
        // Strip the site name that is appended to every page title.
        $title = str_replace(' - My Running Man (MyRM)', '', $title);
    } else {
        error_log("WARNING: no episode title found");
    }

    $stream_link = '';
    if (preg_match('~data-url="([^"]+)"~', $html, $matches)) {
        $stream_link = html_entity_decode($matches[1]);
    } else {
        error_log("WARNING: no stream link for episode $episode_num");
    }

    // Collect every tag link; with PREG_SET_ORDER each entry holds one
    // complete match: [1] is the contributing user, [2] the tag text.
    preg_match_all(
        // <a href="/tag/photo" title="Added by andrew">photo</a>
        '~<a href="/tag/[^"]+" title="Added by ([^"]+)">([^<]+)</a>~ms',
        $html,
        $matches,
        PREG_SET_ORDER
    );
    $tags = [];
    foreach ($matches as $match) {
        $tags[html_entity_decode($match[2])] = html_entity_decode($match[1]);
    }

    $broadcast_date = '';
    if (preg_match('~Broadcast Date: ([0-9-]+)~', $html, $matches)) {
        $broadcast_date = $matches[1];
    } else {
        error_log("WARNING: no broadcast date for episode $episode_num");
    }

    $filming_date = '';
    if (preg_match('~filmed on ([0-9-]+)~', $html, $matches)) {
        $filming_date = $matches[1];
    } else {
        error_log("WARNING: no filming date for episode $episode_num");
    }

    // Location and description are optional; their absence is not reported.
    $location = '';
    if (preg_match('~Location: ([^<]+)<~', $html, $matches)) {
        $location = html_entity_decode($matches[1]);
    }

    $torrent = '';
    if (preg_match('~href="(magnet:[^"]+)"~', $html, $matches)) {
        $torrent = html_entity_decode($matches[1]);
    } else {
        error_log("WARNING: no torrent link for episode $episode_num");
    }

    $description = '';
    if (preg_match('~Description: ([^<]+)<~', $html, $matches)) {
        $description = html_entity_decode($matches[1]);
    }

    return [
        'title' => $title,
        'stream' => $stream_link,
        'tags' => $tags,
        'broadcast_date' => $broadcast_date,
        'filming_date' => $filming_date,
        'location' => $location,
        'description' => $description,
        'torrent' => $torrent,
    ];
}
/**
 * Convert the saved episode pages into one JSON document and print it.
 *
 * Reads the highest episode number from the file 'max_episode', parses
 * 'original-html/1' through 'original-html/<max>' with parse_episode() and
 * echoes the results, keyed by episode number, as pretty-printed JSON.
 *
 * Terminates the script with exit status 1 when 'max_episode' cannot be
 * read or the result cannot be JSON-encoded. An episode page that cannot be
 * read is reported with error_log() and left out of the output.
 */
function main() {
    $raw_max = file_get_contents('max_episode');
    if ($raw_max === false) {
        error_log('ERROR: cannot read max_episode');
        exit(1);
    }
    $max_episode = intval($raw_max);

    $ret = [];
    for ($i = 1; $i <= $max_episode; ++$i) {
        $html = file_get_contents('original-html/'.$i);
        if ($html === false) {
            // Skip instead of parsing `false` into an all-empty record.
            error_log("WARNING: cannot read original-html/$i, skipping episode");
            continue;
        }
        $ret[$i] = parse_episode($html);
    }

    // json_encode() returns false (e.g. on invalid UTF-8); echoing that
    // would silently print nothing, so fail loudly instead.
    $json = json_encode($ret, JSON_PRETTY_PRINT);
    if ($json === false) {
        error_log('ERROR: JSON encoding failed: '.json_last_error_msg());
        exit(1);
    }
    echo $json;
}
// Script entry point. main() as defined in this file declares no parameters,
// so the argv array passed here is not used.
main($_SERVER['argv']);