initial commit
This commit is contained in:
commit
55e53294bd
21
README.md
Normal file
21
README.md
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
# myrunningmancom-scraper
|
||||||
|
|
||||||
|
A scraper for the website <https://myrunningman.com/>.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
1. `./download-pages.sh` to download HTML files only once
|
||||||
|
2. `./download-thumbs.sh` to collect thumbnails (optional)
|
||||||
|
3. `./running-parser.php > output.json` to parse the downloaded HTML into the final `output.json` data file (the parser prints the JSON to standard output)
|
||||||
|
|
||||||
|
## Example output
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"title": "Times Square",
|
||||||
|
"broadcast_date": "2010-07-11",
|
||||||
|
"filming_date": "2010-06-21",
|
||||||
|
"location": "Times Square (Yeongdeungpo-gu, Seoul)",
|
||||||
|
"description": "A never-before-seen action variety show with an amazing cast. To start off the first episode, they head over to the T shopping mall in Seoul after closing hours. They will be split into two different teams and compete against each other for the passcode that will let them escape from the mall. From running around to find clues to ripping each other's name tag, no one can predict what will happen to them. Stay tuned to see which team emerges victorious.",
|
||||||
|
```
|
3
download-pages.sh
Normal file
3
download-pages.sh
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
#!/bin/bash

# Download the HTML page of every episode (1..600).
#
# The pages are stored in ./running-pages/<episode number>, which is the
# location running-parser.php reads them from (wget creates the directory
# when it does not exist yet).
# --no-clobber skips pages that are already present, so the script can be
# re-run after an interruption without producing "1.1", "2.1", ... copies.
for i in {1..600} ; do
    wget --no-clobber --directory-prefix=running-pages "https://www.myrunningman.com/ep/$i"
done
|
6
download-thumbs.sh
Normal file
6
download-thumbs.sh
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
#!/bin/bash

# Collect the episode thumbnails; files are named by the three-digit,
# zero-padded episode number.
base_url='https://www.myrunningman.com/assets/epimg'

# Regular thumbnail of every episode.
for ((ep = 1; ep <= 600; ep++)); do
    printf -v padded '%03d' "$ep"
    wget "${base_url}/${padded}.jpg"
done

# From episode 397 onwards a "<number>_temp.jpg" variant is requested as well.
for ((ep = 397; ep <= 600; ep++)); do
    printf -v padded '%03d' "$ep"
    wget "${base_url}/${padded}_temp.jpg"
done
|
||||||
|
|
83
running-parser.php
Executable file
83
running-parser.php
Executable file
@ -0,0 +1,83 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
// Report every class of PHP error, warning and notice while the script runs.
error_reporting(E_ALL);
|
||||||
|
|
||||||
|
/**
 * Extract the metadata of one episode from its downloaded HTML page.
 *
 * Every field is optional: when a pattern is not found the field is left
 * empty ('' or []) and, for title, stream link, dates and torrent, a warning
 * is written with error_log() instead of triggering "undefined offset"
 * notices on the missing capture group.
 *
 * @param string|false $html Raw page markup; false (e.g. a failed file read)
 *                           is treated like an empty page.
 * @return array Keys: title, stream, tags (tag name => user who added it),
 *               broadcast_date, filming_date, location, description, torrent.
 */
function parse_episode($html) {
    $html = (string) $html;

    // Pin the decoding flags: the defaults of html_entity_decode() changed in
    // PHP 8.1, and the older default leaves &#039; (single quote) undecoded.
    $decode = function ($text) {
        return html_entity_decode($text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401, 'UTF-8');
    };

    // First capture group of $pattern, or null when the page has no match.
    $capture = function ($pattern) use ($html) {
        $found = [];
        return preg_match($pattern, $html, $found) ? $found[1] : null;
    };

    // Title and episode number come from the same <title> element.
    $title = '';
    $episode_num = '???';
    $matches = [];
    if (preg_match('~<title>Episode #([0-9]{3}) - (.+)</title>~', $html, $matches)) {
        $episode_num = $matches[1];
        // Strip the site name that is appended to every page title.
        $title = str_replace(' - My Running Man (MyRM)', '', $decode($matches[2]));
    } else {
        error_log('WARNING: no episode title found');
    }

    $stream_link = $capture('~data-url="([^"]+)"~');
    if ($stream_link === null) {
        error_log("WARNING: no stream link for episode $episode_num");
        $stream_link = '';
    } else {
        $stream_link = $decode($stream_link);
    }

    // Tags look like: <a href="/tag/photo" title="Added by andrew">photo</a>
    $tag_matches = [];
    preg_match_all(
        '~<a href="/tag/[^"]+" title="Added by ([^"]+)">([^<]+)</a>~ms',
        $html,
        $tag_matches,
        PREG_SET_ORDER
    );
    $tags = [];
    foreach ($tag_matches as $match) {
        // tag name => name of the user who added it
        $tags[$decode($match[2])] = $decode($match[1]);
    }

    $broadcast_date = $capture('~Broadcast Date: ([0-9-]+)~');
    if ($broadcast_date === null) {
        error_log("WARNING: no broadcast date for episode $episode_num");
        $broadcast_date = '';
    }

    $filming_date = $capture('~filmed on ([0-9-]+)~');
    if ($filming_date === null) {
        error_log("WARNING: no filming date for episode $episode_num");
        $filming_date = '';
    }

    // Location and description are silently left empty when absent.
    $location = $capture('~Location: ([^<]+)<~');
    $location = ($location === null) ? '' : $decode($location);

    $torrent = $capture('~href="(magnet:[^"]+)"~');
    if ($torrent === null) {
        error_log("WARNING: no torrent link for episode $episode_num");
        $torrent = '';
    } else {
        $torrent = $decode($torrent);
    }

    $description = $capture('~Description: ([^<]+)<~');
    $description = ($description === null) ? '' : $decode($description);

    return [
        'title' => $title,
        'stream' => $stream_link,
        'tags' => $tags,
        'broadcast_date' => $broadcast_date,
        'filming_date' => $filming_date,
        'location' => $location,
        'description' => $description,
        'torrent' => $torrent,
    ];
}
|
||||||
|
|
||||||
|
/**
 * Parse every downloaded episode page (1..600) and print the collected data
 * as pretty-printed JSON on standard output.
 *
 * Pages that cannot be read are skipped with a warning instead of being
 * parsed as empty input.
 *
 * @param array $argv Optional command-line arguments; $argv[1], when given
 *                    and non-empty, overrides the directory the pages are
 *                    read from (default: "running-pages").
 */
function main($argv = []) {
    $pages_dir = (isset($argv[1]) && $argv[1] !== '')
        ? rtrim($argv[1], '/')
        : 'running-pages';

    $ret = [];
    for ($i = 1; $i < 601; ++$i) {
        $path = $pages_dir . '/' . $i;

        // file_get_contents() returns false on failure; do not feed that to the parser.
        $html = is_readable($path) ? file_get_contents($path) : false;
        if ($html === false) {
            error_log("WARNING: cannot read $path, skipping episode $i");
            continue;
        }

        $ret[$i] = parse_episode($html);
    }

    // json_encode() returns false on failure (e.g. malformed UTF-8 in a page);
    // echoing that would silently print nothing.
    $json = json_encode($ret, JSON_PRETTY_PRINT);
    if ($json === false) {
        error_log('ERROR: could not encode the result as JSON (json_last_error() = ' . json_last_error() . ')');
        return;
    }

    echo $json;
}
|
||||||
|
|
||||||
|
// Script entry point: run the parser, handing it the command-line arguments.
main($_SERVER['argv']);
|
Loading…
Reference in New Issue
Block a user