Episode #([0-9]{3}) - (.+)~', $html, $matches);
    // NOTE(review): the start of this function (and of the pattern above) is
    // outside this chunk; the statements below are kept exactly as found.
    $title = html_entity_decode($matches[2]);
    // Drop the site-name suffix from the title.
    $title = str_replace(' - My Running Man (MyRM)', '', $title);
    // First capture group of the pattern above: the three-digit episode number.
    $episode_num = $matches[1];

    // The stream link is optional: log a warning and keep '' when it is absent.
    $stream_link = '';
    if (preg_match('~data-url="([^"]+)"~', $html, $matches)) {
        $stream_link = html_entity_decode($matches[1]);
    } else {
        error_log("WARNING: no stream link for episode $episode_num");
    }

    // NOTE(review): the pattern below has a single capture group, yet the loop
    // reads $match[2] as well as $match[1]. The pattern looks truncated in
    // this copy -- confirm against the original file before relying on $tags.
    preg_match_all(
        // photo
        '~([^<]+)~ms',
        $html,
        $matches,
        PREG_SET_ORDER
    );
    $tags = [];
    foreach($matches as $match) {
        $tags[html_entity_decode($match[2])] = html_entity_decode($match[1]);
    }

    // NOTE(review): the result of this preg_match() is not checked, so
    // $matches[1] is unset when the page has no broadcast date.
    preg_match('~Broadcast Date: ([0-9-]+)~', $html, $matches);
    $broadcast_date = $matches[1];

    // The filming date is optional: log a warning and keep '' when it is absent.
    $filming_date = '';
    if (preg_match('~filmed on ([0-9-]+)~', $html, $matches)) {
        $filming_date = $matches[1];
    } else {
        error_log("WARNING: no filming date for episode $episode_num");
    }

    // The location is optional and silently defaults to ''.
    $location = '';
    if (preg_match('~Location: ([^<]+)<~', $html, $matches)) {
        $location = html_entity_decode($matches[1]);
    }

    // NOTE(review): unchecked match, same caveat as the broadcast date above.
    preg_match('~href="(magnet:[^"]+)"~', $html, $matches);
    $torrent = html_entity_decode($matches[1]);

    // The description is optional and defaults to "".
    $description = preg_match('~Description: ([^<]+)<~', $html, $matches) ? html_entity_decode($matches[1]) : "";

    $ret = [
        'title' => $title,
        'stream' => $stream_link,
        'tags' => $tags,
        'broadcast_date' => $broadcast_date,
        'filming_date' => $filming_date,
        'location' => $location,
        'description' => $description,
        'torrent' => $torrent,
    ];
    return $ret;
}

/**
 * Parse the saved pages running-pages/1 .. running-pages/600 with
 * parse_episode() and print the results, keyed by page number, as
 * pretty-printed JSON on standard output.
 *
 * A page that cannot be read is reported through error_log() and skipped,
 * instead of passing `false` to parse_episode(). A JSON encoding failure is
 * reported through error_log() instead of silently printing nothing.
 *
 * @return void
 */
function main() {
    $episodes = [];
    for ($i = 1; $i < 601; ++$i) {
        $path = 'running-pages/'.$i;
        // is_readable() first, so a missing page does not raise a PHP warning
        // from file_get_contents(); the read itself can still return false.
        $html = is_readable($path) ? file_get_contents($path) : false;
        if ($html === false) {
            error_log("WARNING: cannot read $path, skipping episode $i");
            continue;
        }
        $episodes[$i] = parse_episode($html);
    }

    // json_encode() returns false on failure (for example on invalid UTF-8).
    $json = json_encode($episodes, JSON_PRETTY_PRINT);
    if ($json === false) {
        error_log('ERROR: JSON encoding failed: '.json_last_error_msg());
        return;
    }
    echo $json;
}

// main() declares no parameters, so the argument passed here is ignored.
main($_SERVER['argv']);