From 8366e4a29ce678bd5aff6843b408361db14c5ec8 Mon Sep 17 00:00:00 2001 From: mappu Date: Mon, 12 Apr 2021 11:21:59 +1200 Subject: [PATCH] extract scraper and subtitle writer to separate files --- main.go | 169 --------------------------------------------------- scrape.go | 119 ++++++++++++++++++++++++++++++++++++ writesubs.go | 61 +++++++++++++++++++ 3 files changed, 180 insertions(+), 169 deletions(-) create mode 100644 scrape.go create mode 100644 writesubs.go diff --git a/main.go b/main.go index 3c30099..5d8aad8 100644 --- a/main.go +++ b/main.go @@ -2,8 +2,6 @@ package main import ( "context" - "encoding/json" - "errors" "flag" "fmt" "hash/crc32" @@ -13,175 +11,8 @@ import ( "os" "os/exec" "path/filepath" - "regexp" - "strconv" - "strings" - "time" ) -type loadTupContent struct { - Title string - VideoID string - Interval float64 - Secs []float64 - Caps []string - Scrsecs []float64 - Scrcaps []string -} - -func parse_json_floats(part []byte) ([]float64, error) { - secsStrings := make([]string, 0) - err := json.Unmarshal(part, &secsStrings) - if err != nil { - return nil, err - } - ret := make([]float64, 0, len(secsStrings)) - for _, sec := range secsStrings { - ff, err := strconv.ParseFloat(sec, 64) - if err != nil { - return nil, err - } - ret = append(ret, ff) - } - return ret, nil -} - -func NewLoadTupContent(content []byte) (*loadTupContent, error) { - - var err error - rx := regexp.MustCompile(`videoIdMain = "([^"]+)";var interval = (\d+);var secs = (.+?);var caps = (.+?);var scrsecs = (.+?);var scrcaps = (.+?);`) - - parts := rx.FindStringSubmatch(string(content)) - if parts == nil { - return nil, errors.New("Missing video properties in HTML page") - } - - if len(parts) != 7 { - return nil, fmt.Errorf("expected len(parts)=7, got %d", len(parts)) - } - - ltc := loadTupContent{} - ltc.VideoID = parts[1] - ltc.Interval, err = strconv.ParseFloat(parts[2], 64) - if err != nil { - return nil, err - } - - ltc.Secs, err = parse_json_floats([]byte(parts[3])) - if 
err != nil { - return nil, err - } - - err = json.Unmarshal([]byte(parts[4]), &ltc.Caps) - if err != nil { - return nil, err - } - - ltc.Scrsecs, err = parse_json_floats([]byte(parts[5])) - if err != nil { - return nil, err - } - - err = json.Unmarshal([]byte(parts[6]), &ltc.Scrcaps) - if err != nil { - return nil, err - } - - // Parse the page title - - rx = regexp.MustCompile(`(?ms)

(.+?)

`) - parts = rx.FindStringSubmatch(string(content)) - if parts == nil { - return nil, errors.New("Missing title in HTML page") - } - - if len(parts) != 2 { - return nil, fmt.Errorf("expected len(parts)=2, got %d", len(parts)) - } - - ltc.Title = strings.TrimSpace(parts[1]) - - return &ltc, nil -} - -func (ltc *loadTupContent) Validate() error { - if len(ltc.Secs) != len(ltc.Caps) { - return fmt.Errorf("secs/caps length mismatch") - } - - if len(ltc.Scrsecs) != len(ltc.Scrcaps) { - return fmt.Errorf("scrsecs/scrcaps length mismatch") - } - - if !(len(ltc.Scrcaps) == 0 || ltc.Scrcaps[0] == "") { - return errors.New("unsupported use of strcaps") - } - - if len(ltc.VideoID) == 0 { - return errors.New("unexpected blank video ID") - } - - if ltc.Interval != 100.0 { - return errors.New("unsupported non-100 duration field") - } - - return nil -} - -func secs_to_srt_time(secs float64) string { - dur := time.Duration(secs) * time.Second - - hh := int64(dur.Hours()) - mm := int64(dur.Minutes()) - (hh * 60) - ss := int64(dur.Seconds()) - (hh * 3600) - (mm * 60) - ms := int64(dur.Milliseconds()) - (hh * 3600000) - (mm * 60000) - (ss * 1000) - - return fmt.Sprintf("%02d:%02d:%02d,%03d", hh, mm, ss, ms) -} - -func (ltc *loadTupContent) WriteSRT(w io.Writer) error { - /* - - SRT file format (example from Wikipedia): - - 1 - 00:02:17,440 --> 00:02:20,375 - Senator, we're making - our final approach into Coruscant. - - 2 - 00:02:20,476 --> 00:02:22,501 - Very good, Lieutenant. - */ - - ctr := 1 - for i := 0; i < len(ltc.Caps); i += 1 { - if ltc.Caps[i] == "" { - // Don't show anything - continue - } - - start := secs_to_srt_time(ltc.Secs[i]) - var end string - if i < len(ltc.Caps)-1 { - end = secs_to_srt_time(ltc.Secs[i+1]) - } else { - // The final subtitle. 
We don't know how long it should be displayed - // for since we don't know the entire video's duration - // FIXME supply - // Assume 3 seconds - end = secs_to_srt_time(ltc.Secs[i] + 3) - } - - fmt.Fprintf(w, "%d\n%s --> %s\n%s\n\n", - ctr, start, end, ltc.Caps[i]) - // We emitted a message, increase the counter - ctr += 1 - } - - return nil -} - type config struct { youtubeDl string mkvmerge string diff --git a/scrape.go b/scrape.go new file mode 100644 index 0000000..6bcfb1a --- /dev/null +++ b/scrape.go @@ -0,0 +1,119 @@ +package main + +import ( + "encoding/json" + "errors" + "fmt" + "regexp" + "strconv" + "strings" +) + +type loadTupContent struct { + Title string + VideoID string + Interval float64 + Secs []float64 + Caps []string + Scrsecs []float64 + Scrcaps []string +} + +func parse_json_floats(part []byte) ([]float64, error) { + secsStrings := make([]string, 0) + err := json.Unmarshal(part, &secsStrings) + if err != nil { + return nil, err + } + ret := make([]float64, 0, len(secsStrings)) + for _, sec := range secsStrings { + ff, err := strconv.ParseFloat(sec, 64) + if err != nil { + return nil, err + } + ret = append(ret, ff) + } + return ret, nil +} + +func NewLoadTupContent(content []byte) (*loadTupContent, error) { + + var err error + rx := regexp.MustCompile(`videoIdMain = "([^"]+)";var interval = (\d+);var secs = (.+?);var caps = (.+?);var scrsecs = (.+?);var scrcaps = (.+?);`) + + parts := rx.FindStringSubmatch(string(content)) + if parts == nil { + return nil, errors.New("Missing video properties in HTML page") + } + + if len(parts) != 7 { + return nil, fmt.Errorf("expected len(parts)=7, got %d", len(parts)) + } + + ltc := loadTupContent{} + ltc.VideoID = parts[1] + ltc.Interval, err = strconv.ParseFloat(parts[2], 64) + if err != nil { + return nil, err + } + + ltc.Secs, err = parse_json_floats([]byte(parts[3])) + if err != nil { + return nil, err + } + + err = json.Unmarshal([]byte(parts[4]), &ltc.Caps) + if err != nil { + return nil, err + } + + 
ltc.Scrsecs, err = parse_json_floats([]byte(parts[5])) + if err != nil { + return nil, err + } + + err = json.Unmarshal([]byte(parts[6]), &ltc.Scrcaps) + if err != nil { + return nil, err + } + + // Parse the page title + + rx = regexp.MustCompile(`(?ms)

(.+?)

`) + parts = rx.FindStringSubmatch(string(content)) + if parts == nil { + return nil, errors.New("Missing title in HTML page") + } + + if len(parts) != 2 { + return nil, fmt.Errorf("expected len(parts)=2, got %d", len(parts)) + } + + ltc.Title = strings.TrimSpace(parts[1]) + + return &ltc, nil +} + +func (ltc *loadTupContent) Validate() error { + if len(ltc.Secs) != len(ltc.Caps) { + return fmt.Errorf("secs/caps length mismatch") + } + + if len(ltc.Scrsecs) != len(ltc.Scrcaps) { + return fmt.Errorf("scrsecs/scrcaps length mismatch") + } + + if !(len(ltc.Scrcaps) == 0 || ltc.Scrcaps[0] == "") { + return errors.New("unsupported use of strcaps") + } + + if len(ltc.VideoID) == 0 { + return errors.New("unexpected blank video ID") + } + + if ltc.Interval != 100.0 { + return errors.New("unsupported non-100 duration field") + } + + return nil +} diff --git a/writesubs.go b/writesubs.go new file mode 100644 index 0000000..4994afd --- /dev/null +++ b/writesubs.go @@ -0,0 +1,61 @@ +package main + +import ( + "fmt" + "io" + "time" +) + +func secs_to_srt_time(secs float64) string { + dur := time.Duration(secs) * time.Second + + hh := int64(dur.Hours()) + mm := int64(dur.Minutes()) - (hh * 60) + ss := int64(dur.Seconds()) - (hh * 3600) - (mm * 60) + ms := int64(dur.Milliseconds()) - (hh * 3600000) - (mm * 60000) - (ss * 1000) + + return fmt.Sprintf("%02d:%02d:%02d,%03d", hh, mm, ss, ms) +} + +func (ltc *loadTupContent) WriteSRT(w io.Writer) error { + /* + + SRT file format (example from Wikipedia): + + 1 + 00:02:17,440 --> 00:02:20,375 + Senator, we're making + our final approach into Coruscant. + + 2 + 00:02:20,476 --> 00:02:22,501 + Very good, Lieutenant. + */ + + ctr := 1 + for i := 0; i < len(ltc.Caps); i += 1 { + if ltc.Caps[i] == "" { + // Don't show anything + continue + } + + start := secs_to_srt_time(ltc.Secs[i]) + var end string + if i < len(ltc.Caps)-1 { + end = secs_to_srt_time(ltc.Secs[i+1]) + } else { + // The final subtitle. 
We don't know how long it should be displayed + // for since we don't know the entire video's duration + // FIXME supply + // Assume 3 seconds + end = secs_to_srt_time(ltc.Secs[i] + 3) + } + + fmt.Fprintf(w, "%d\n%s --> %s\n%s\n\n", + ctr, start, end, ltc.Caps[i]) + // We emitted a message, increase the counter + ctr += 1 + } + + return nil +}