From edf934c6a6a81cefad528c1897cd2db3f6236557 Mon Sep 17 00:00:00 2001
From: mappu04
Date: Thu, 15 Jun 2023 19:49:04 +1200
Subject: [PATCH] combine download-pages and download-thumbs; stash max_episode file

---
Review notes (text between the "---" line and the diff is not part of the
commit):

 - Line structure of the patch restored; it had been collapsed onto one line.
 - Episode thresholds are now compared arithmetically. Inside [[ ]] the "<"
   operator compares strings, so episodes 4-9 and 40-99 were sent to the
   "_temp" thumbnail URL.
 - Downloads go to "<dest>.part" and are renamed on success, so an aborted
   run no longer leaves an empty file that later runs treat as complete, and
   a failed feed download no longer truncates the previous rss.xml.
 - MAX_EPISODE is validated as a single integer before it drives the loop.
 - fgrep/egrep replaced by grep -F / grep -E.
 - NOTE(review): the grep -F pattern is empty in the submitted patch, so it
   selects every line of rss.xml. Presumably an angle-bracketed tag was lost
   in transit (the From: line is missing its address the same way) - confirm
   the intended tag with the author.
 - The diff body was edited by hand: the post-image blob id on the "index"
   line is stale, and the diffstat was recomputed by hand.

 download-pages.sh  | 68 ++++++++++++++++++++++++++++++++++++++++++++++-
 download-thumbs.sh |  6 -----
 2 files changed, 67 insertions(+), 7 deletions(-)
 mode change 100644 => 100755 download-pages.sh
 delete mode 100644 download-thumbs.sh

diff --git a/download-pages.sh b/download-pages.sh
old mode 100644
new mode 100755
index fca677c..ceab551
--- a/download-pages.sh
+++ b/download-pages.sh
@@ -1,3 +1,69 @@
 #!/bin/bash
 
-for i in {1..600} ; do wget 'https://www.myrunningman.com/ep/'$i ; done
+set -euo pipefail
+
+BASEURL=https://www.myrunningman.com
+
+# Print a message to stderr and abort.
+die() {
+  printf '%s\n' "$*" >&2
+  exit 1
+}
+
+# Download URL $1 to file $2. wget -O creates the output file before the
+# transfer starts, so a failed request would leave an empty or partial file
+# behind that the "already downloaded" checks below would accept on the next
+# run. Fetch into a temporary name and only rename it on success.
+fetch() {
+  local url=$1 dest=$2
+  if ! wget "${url}" -O "${dest}.part" ; then
+    rm -f -- "${dest}.part"
+    die "download failed: ${url}"
+  fi
+  mv -- "${dest}.part" "${dest}"
+}
+
+# Update magnet rss.xml
+
+fetch "${BASEURL}/rss.xml" rss.xml
+
+# Find latest episode
+# NOTE(review): the fixed-string pattern is empty, so every line of rss.xml
+# is selected; presumably a tag filter was intended - confirm against the
+# feed before narrowing it.
+
+MAX_EPISODE=$(grep -F '' rss.xml | sort | tail -n 1 | grep -Eo '[0-9]{3,4}')
+if [[ ! ${MAX_EPISODE} =~ ^[0-9]+$ ]] ; then
+  die "cannot determine latest episode from rss.xml (got '${MAX_EPISODE}')"
+fi
+# Force base 10 so that a zero-padded match is not parsed as octal.
+MAX_EPISODE=$(( 10#${MAX_EPISODE} ))
+printf '%s\n' "${MAX_EPISODE}" > max_episode
+
+# Scrape pages and thumbnails
+
+mkdir -p original-html thumb
+for (( i = 1; i <= MAX_EPISODE; i++ )) ; do
+
+  # Raw HTML
+  if [[ ! -f original-html/${i} ]] ; then
+    fetch "${BASEURL}/ep/${i}" "original-html/${i}"
+  fi
+
+  # thumbnails
+  printf -v PADNAME '%03d' "${i}"
+  if [[ ! -f thumb/${PADNAME}.jpg ]] ; then
+    # Compare arithmetically: inside [[ ]], "<" is a string comparison,
+    # which sorts e.g. "40" after "397" and would pick the wrong URL.
+    if (( i == 310 )) ; then
+      echo "no image for 310"
+    elif (( i < 397 )) ; then
+      fetch "${BASEURL}/assets/epimg/${PADNAME}.jpg" "thumb/${PADNAME}.jpg"
+    else
+      # _temp suffix for 397++
+      fetch "${BASEURL}/assets/epimg/${PADNAME}_temp.jpg" "thumb/${PADNAME}.jpg"
+    fi
+  fi
+done
+
+echo "Finished (up to episode ${MAX_EPISODE})"
diff --git a/download-thumbs.sh b/download-thumbs.sh
deleted file mode 100644
index 9904b7a..0000000
--- a/download-thumbs.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-for i in {1..600} ; do wget 'https://www.myrunningman.com/assets/epimg/'$(printf "%03d" $i)'.jpg' ; done
-
-for i in {397..600} ; do wget 'https://www.myrunningman.com/assets/epimg/'$(printf "%03d" $i)_temp'.jpg' ; done
-