myrunningmancom-scraper/download-pages.sh

40 lines
882 B
Bash
Executable File

#!/bin/bash
#
# Mirror episode pages and thumbnails from myrunningman.com.
#
# Everything is written relative to the current working directory:
#   rss.xml            fresh copy of the site's rss.xml feed
#   max_episode        episode number extracted from the feed
#   original-html/<n>  raw HTML of episode page <n>
#   thumb/<nnn>.jpg    thumbnail, number zero-padded to three digits
#
# Pages and thumbnails that already exist on disk are not fetched again,
# so the script can be re-run to pick up only what is missing.
# Any failed download aborts the script (set -e).
set -eu

BASEURL=https://www.myrunningman.com

#######################################
# Download a URL to a file without leaving a broken file behind.
# wget -O creates the output file before the transfer starts, so a failed
# request would otherwise leave an empty file that the "already downloaded"
# checks below would treat as complete on the next run.
# Arguments: $1 - URL to fetch
#            $2 - destination path
# Returns:   0 on success, wget's exit status on failure
#######################################
fetch() {
  local url=$1
  local dest=$2
  local partial="${dest}.part"
  local status

  if wget "${url}" -O "${partial}"; then
    mv -- "${partial}" "${dest}"
  else
    status=$?
    rm -f -- "${partial}"
    return "${status}"
  fi
}

# Update magnet rss.xml
fetch "${BASEURL}/rss.xml" rss.xml

# Find latest episode: take the last <title> line in sort order and pull the
# 3-4 digit number out of it.
MAX_EPISODE=$(grep -F '<title>' rss.xml | sort | tail -n 1 | grep -Eo '[0-9]{3,4}')

# A title holding several 3-4 digit numbers yields a multi-line value that
# would silently corrupt the loop bounds; refuse to continue instead.
if [[ ! "${MAX_EPISODE}" =~ ^[0-9]+$ ]]; then
  printf 'error: could not determine a single episode number from rss.xml (got: %s)\n' \
    "${MAX_EPISODE}" >&2
  exit 1
fi
printf '%s\n' "${MAX_EPISODE}" > max_episode

# Force base 10 so a leading zero is not read as octal in arithmetic below.
last_episode=$((10#${MAX_EPISODE}))

# Scrape pages and thumbnails
mkdir -p original-html thumb
for ((i = 1; i <= last_episode; i++)); do
  # Raw HTML
  page="original-html/${i}"
  if [[ ! -f "${page}" ]]; then
    fetch "${BASEURL}/ep/${i}" "${page}"
  fi

  # Thumbnails
  printf -v padname '%03d' "${i}"
  thumb="thumb/${padname}.jpg"
  if [[ ! -f "${thumb}" ]]; then
    # Numeric comparison on purpose: [[ $i < 397 ]] compares strings, which
    # sends e.g. episodes 4-9 and 40-99 to the wrong branch.
    if ((i == 310)); then
      echo "no image for 310"
    elif ((i < 397)); then
      fetch "${BASEURL}/assets/epimg/${padname}.jpg" "${thumb}"
    else
      # _temp suffix for 397++
      fetch "${BASEURL}/assets/epimg/${padname}_temp.jpg" "${thumb}"
    fi
  fi
done

echo "Finished (up to episode ${MAX_EPISODE})"