extract scraper and subtitle writer to separate files

This commit is contained in:
mappu 2021-04-12 11:21:59 +12:00
parent 5163160bf3
commit 8366e4a29c
3 changed files with 180 additions and 169 deletions

169
main.go
View File

@ -2,8 +2,6 @@ package main
import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"hash/crc32"
@ -13,175 +11,8 @@ import (
"os"
"os/exec"
"path/filepath"
"regexp"
"strconv"
"strings"
"time"
)
// loadTupContent holds the video properties that NewLoadTupContent scrapes
// out of a video page's HTML.
type loadTupContent struct {
	// Title is the whitespace-trimmed content of the page's <h2> heading.
	Title string
	// VideoID is the value of the page script's videoIdMain variable.
	VideoID string
	// Interval is the page script's "interval" value. Validate only accepts 100.
	Interval float64
	// Secs is the page script's "secs" array, converted from JSON strings to
	// floats. WriteSRT uses Secs[i] as the start time of Caps[i].
	Secs []float64
	// Caps is the page script's "caps" array. WriteSRT skips empty entries.
	Caps []string
	// Scrsecs is the page script's "scrsecs" array, converted like Secs.
	Scrsecs []float64
	// Scrcaps is the page script's "scrcaps" array. Validate rejects content
	// whose first Scrcaps entry is non-empty.
	Scrcaps []string
}
// parse_json_floats decodes part as a JSON array of strings and converts
// every element to a float64.
//
// It returns the JSON decoding error if part is not an array of strings, or
// the strconv error for the first element that is not a valid float. On
// success the returned slice is never nil, even when the array is empty.
func parse_json_floats(part []byte) ([]float64, error) {
	var raw []string
	if err := json.Unmarshal(part, &raw); err != nil {
		return nil, err
	}

	floats := make([]float64, len(raw))
	for i := range raw {
		value, err := strconv.ParseFloat(raw[i], 64)
		if err != nil {
			return nil, err
		}
		floats[i] = value
	}
	return floats, nil
}
// Patterns used by NewLoadTupContent, compiled once at package initialisation
// instead of on every call.
var (
	// loadTupPropsRx captures, in order, the videoIdMain, interval, secs,
	// caps, scrsecs and scrcaps values from the page's inline script.
	loadTupPropsRx = regexp.MustCompile(`videoIdMain = "([^"]+)";var interval = (\d+);var secs = (.+?);var caps = (.+?);var scrsecs = (.+?);var scrcaps = (.+?);</script>`)

	// loadTupTitleRx captures the content of the page's title heading.
	loadTupTitleRx = regexp.MustCompile(`(?ms)<h2 style="margin:0 0 0 0;">(.+?)</h2>`)
)

// NewLoadTupContent scrapes the video properties and the title out of the
// HTML page in content.
//
// It returns an error if the inline script holding the video properties or
// the title heading cannot be found, or if any captured value fails to parse.
// Parse errors are wrapped with the name of the offending field. The result
// is not checked for consistency; call Validate for that.
func NewLoadTupContent(content []byte) (*loadTupContent, error) {
	// A non-nil match always has one entry per capture group plus the full
	// match, so the indexes used below are guaranteed to exist.
	props := loadTupPropsRx.FindSubmatch(content)
	if props == nil {
		return nil, errors.New("Missing video properties in HTML page")
	}

	var (
		ltc loadTupContent
		err error
	)

	ltc.VideoID = string(props[1])

	ltc.Interval, err = strconv.ParseFloat(string(props[2]), 64)
	if err != nil {
		return nil, fmt.Errorf("parsing interval: %w", err)
	}

	ltc.Secs, err = parse_json_floats(props[3])
	if err != nil {
		return nil, fmt.Errorf("parsing secs: %w", err)
	}

	if err = json.Unmarshal(props[4], &ltc.Caps); err != nil {
		return nil, fmt.Errorf("parsing caps: %w", err)
	}

	ltc.Scrsecs, err = parse_json_floats(props[5])
	if err != nil {
		return nil, fmt.Errorf("parsing scrsecs: %w", err)
	}

	if err = json.Unmarshal(props[6], &ltc.Scrcaps); err != nil {
		return nil, fmt.Errorf("parsing scrcaps: %w", err)
	}

	// Parse the page title
	title := loadTupTitleRx.FindSubmatch(content)
	if title == nil {
		return nil, errors.New("Missing title in HTML page")
	}
	ltc.Title = strings.TrimSpace(string(title[1]))

	return &ltc, nil
}
// Validate reports whether the scraped content is in a shape the rest of the
// program supports. It returns an error when:
//
//   - Secs and Caps, or Scrsecs and Scrcaps, differ in length;
//   - Scrcaps is non-empty and its first entry is not the empty string;
//   - VideoID is blank;
//   - Interval is anything other than 100.
func (ltc *loadTupContent) Validate() error {
	if len(ltc.Secs) != len(ltc.Caps) {
		return fmt.Errorf("secs/caps length mismatch (%d != %d)", len(ltc.Secs), len(ltc.Caps))
	}

	if len(ltc.Scrsecs) != len(ltc.Scrcaps) {
		return fmt.Errorf("scrsecs/scrcaps length mismatch (%d != %d)", len(ltc.Scrsecs), len(ltc.Scrcaps))
	}

	// Only the first entry is inspected, exactly as before.
	if len(ltc.Scrcaps) > 0 && ltc.Scrcaps[0] != "" {
		return errors.New("unsupported use of scrcaps")
	}

	if ltc.VideoID == "" {
		return errors.New("unexpected blank video ID")
	}

	if ltc.Interval != 100.0 {
		return errors.New("unsupported non-100 duration field")
	}

	return nil
}
// secs_to_srt_time formats a time offset given in seconds as an SRT
// timestamp of the form "HH:MM:SS,mmm".
//
// The fractional part of secs is kept and rounded to the nearest millisecond.
// Negative offsets, which an SRT timestamp cannot represent, are clamped to
// zero.
func secs_to_srt_time(secs float64) string {
	if secs < 0 {
		secs = 0
	}

	// Scale to milliseconds *before* converting to an integer type; converting
	// secs itself to time.Duration would discard everything after the decimal
	// point. Adding 0.5 turns the truncating conversion into round-to-nearest.
	rest := time.Duration(secs*1000+0.5) * time.Millisecond

	hh := rest / time.Hour
	rest -= hh * time.Hour
	mm := rest / time.Minute
	rest -= mm * time.Minute
	ss := rest / time.Second
	rest -= ss * time.Second
	ms := rest / time.Millisecond

	return fmt.Sprintf("%02d:%02d:%02d,%03d", int64(hh), int64(mm), int64(ss), int64(ms))
}
// WriteSRT writes the captions in ltc to w as SRT subtitle cues.
//
// Caps[i] is shown from Secs[i] until Secs[i+1]. Empty captions produce no
// cue, but their timestamp still ends the preceding cue. Cues are numbered
// consecutively from 1, counting only the ones actually written.
//
// It returns an error if there are fewer timestamps than captions, or if
// writing to w fails; in the latter case output stops at the failing cue.
func (ltc *loadTupContent) WriteSRT(w io.Writer) error {

	/*
		SRT file format (example from Wikipedia):

		1
		00:02:17,440 --> 00:02:20,375
		Senator, we're making
		our final approach into Coruscant.

		2
		00:02:20,476 --> 00:02:22,501
		Very good, Lieutenant.
	*/

	// Every caption is indexed into Secs below, so refuse to run (rather than
	// panic) when the caller skipped Validate and the data is short.
	if len(ltc.Secs) < len(ltc.Caps) {
		return fmt.Errorf("secs/caps length mismatch: %d timestamps for %d captions", len(ltc.Secs), len(ltc.Caps))
	}

	ctr := 1
	last := len(ltc.Caps) - 1
	for i, caption := range ltc.Caps {
		if caption == "" {
			// Don't show anything
			continue
		}

		start := secs_to_srt_time(ltc.Secs[i])

		var end string
		if i < last {
			end = secs_to_srt_time(ltc.Secs[i+1])
		} else {
			// The final subtitle. We don't know how long it should be displayed
			// for since we don't know the entire video's duration
			// FIXME supply
			// Assume 3 seconds
			end = secs_to_srt_time(ltc.Secs[i] + 3)
		}

		if _, err := fmt.Fprintf(w, "%d\n%s --> %s\n%s\n\n", ctr, start, end, caption); err != nil {
			return fmt.Errorf("writing SRT cue %d: %w", ctr, err)
		}

		// We emitted a message, increase the counter
		ctr++
	}

	return nil
}
type config struct {
youtubeDl string
mkvmerge string

119
scrape.go Normal file
View File

@ -0,0 +1,119 @@
package main
import (
"encoding/json"
"errors"
"fmt"
"regexp"
"strconv"
"strings"
)
// loadTupContent holds the video properties that NewLoadTupContent scrapes
// out of a video page's HTML.
type loadTupContent struct {
	// Title is the whitespace-trimmed content of the page's <h2> heading.
	Title string
	// VideoID is the value of the page script's videoIdMain variable.
	VideoID string
	// Interval is the page script's "interval" value. Validate only accepts 100.
	Interval float64
	// Secs is the page script's "secs" array, converted from JSON strings to
	// floats. WriteSRT uses Secs[i] as the start time of Caps[i].
	Secs []float64
	// Caps is the page script's "caps" array. WriteSRT skips empty entries.
	Caps []string
	// Scrsecs is the page script's "scrsecs" array, converted like Secs.
	Scrsecs []float64
	// Scrcaps is the page script's "scrcaps" array. Validate rejects content
	// whose first Scrcaps entry is non-empty.
	Scrcaps []string
}
// parse_json_floats decodes part as a JSON array of strings and converts
// every element to a float64.
//
// It returns the JSON decoding error if part is not an array of strings, or
// the strconv error for the first element that is not a valid float. On
// success the returned slice is never nil, even when the array is empty.
func parse_json_floats(part []byte) ([]float64, error) {
	var raw []string
	if err := json.Unmarshal(part, &raw); err != nil {
		return nil, err
	}

	floats := make([]float64, len(raw))
	for i := range raw {
		value, err := strconv.ParseFloat(raw[i], 64)
		if err != nil {
			return nil, err
		}
		floats[i] = value
	}
	return floats, nil
}
// Patterns used by NewLoadTupContent, compiled once at package initialisation
// instead of on every call.
var (
	// loadTupPropsRx captures, in order, the videoIdMain, interval, secs,
	// caps, scrsecs and scrcaps values from the page's inline script.
	loadTupPropsRx = regexp.MustCompile(`videoIdMain = "([^"]+)";var interval = (\d+);var secs = (.+?);var caps = (.+?);var scrsecs = (.+?);var scrcaps = (.+?);</script>`)

	// loadTupTitleRx captures the content of the page's title heading.
	loadTupTitleRx = regexp.MustCompile(`(?ms)<h2 style="margin:0 0 0 0;">(.+?)</h2>`)
)

// NewLoadTupContent scrapes the video properties and the title out of the
// HTML page in content.
//
// It returns an error if the inline script holding the video properties or
// the title heading cannot be found, or if any captured value fails to parse.
// Parse errors are wrapped with the name of the offending field. The result
// is not checked for consistency; call Validate for that.
func NewLoadTupContent(content []byte) (*loadTupContent, error) {
	// A non-nil match always has one entry per capture group plus the full
	// match, so the indexes used below are guaranteed to exist.
	props := loadTupPropsRx.FindSubmatch(content)
	if props == nil {
		return nil, errors.New("Missing video properties in HTML page")
	}

	var (
		ltc loadTupContent
		err error
	)

	ltc.VideoID = string(props[1])

	ltc.Interval, err = strconv.ParseFloat(string(props[2]), 64)
	if err != nil {
		return nil, fmt.Errorf("parsing interval: %w", err)
	}

	ltc.Secs, err = parse_json_floats(props[3])
	if err != nil {
		return nil, fmt.Errorf("parsing secs: %w", err)
	}

	if err = json.Unmarshal(props[4], &ltc.Caps); err != nil {
		return nil, fmt.Errorf("parsing caps: %w", err)
	}

	ltc.Scrsecs, err = parse_json_floats(props[5])
	if err != nil {
		return nil, fmt.Errorf("parsing scrsecs: %w", err)
	}

	if err = json.Unmarshal(props[6], &ltc.Scrcaps); err != nil {
		return nil, fmt.Errorf("parsing scrcaps: %w", err)
	}

	// Parse the page title
	title := loadTupTitleRx.FindSubmatch(content)
	if title == nil {
		return nil, errors.New("Missing title in HTML page")
	}
	ltc.Title = strings.TrimSpace(string(title[1]))

	return &ltc, nil
}
// Validate reports whether the scraped content is in a shape the rest of the
// program supports. It returns an error when:
//
//   - Secs and Caps, or Scrsecs and Scrcaps, differ in length;
//   - Scrcaps is non-empty and its first entry is not the empty string;
//   - VideoID is blank;
//   - Interval is anything other than 100.
func (ltc *loadTupContent) Validate() error {
	if len(ltc.Secs) != len(ltc.Caps) {
		return fmt.Errorf("secs/caps length mismatch (%d != %d)", len(ltc.Secs), len(ltc.Caps))
	}

	if len(ltc.Scrsecs) != len(ltc.Scrcaps) {
		return fmt.Errorf("scrsecs/scrcaps length mismatch (%d != %d)", len(ltc.Scrsecs), len(ltc.Scrcaps))
	}

	// Only the first entry is inspected, exactly as before.
	if len(ltc.Scrcaps) > 0 && ltc.Scrcaps[0] != "" {
		return errors.New("unsupported use of scrcaps")
	}

	if ltc.VideoID == "" {
		return errors.New("unexpected blank video ID")
	}

	if ltc.Interval != 100.0 {
		return errors.New("unsupported non-100 duration field")
	}

	return nil
}

61
writesubs.go Normal file
View File

@ -0,0 +1,61 @@
package main
import (
"fmt"
"io"
"time"
)
// secs_to_srt_time formats a time offset given in seconds as an SRT
// timestamp of the form "HH:MM:SS,mmm".
//
// The fractional part of secs is kept and rounded to the nearest millisecond.
// Negative offsets, which an SRT timestamp cannot represent, are clamped to
// zero.
func secs_to_srt_time(secs float64) string {
	if secs < 0 {
		secs = 0
	}

	// Scale to milliseconds *before* converting to an integer type; converting
	// secs itself to time.Duration would discard everything after the decimal
	// point. Adding 0.5 turns the truncating conversion into round-to-nearest.
	rest := time.Duration(secs*1000+0.5) * time.Millisecond

	hh := rest / time.Hour
	rest -= hh * time.Hour
	mm := rest / time.Minute
	rest -= mm * time.Minute
	ss := rest / time.Second
	rest -= ss * time.Second
	ms := rest / time.Millisecond

	return fmt.Sprintf("%02d:%02d:%02d,%03d", int64(hh), int64(mm), int64(ss), int64(ms))
}
// WriteSRT writes the captions in ltc to w as SRT subtitle cues.
//
// Caps[i] is shown from Secs[i] until Secs[i+1]. Empty captions produce no
// cue, but their timestamp still ends the preceding cue. Cues are numbered
// consecutively from 1, counting only the ones actually written.
//
// It returns an error if there are fewer timestamps than captions, or if
// writing to w fails; in the latter case output stops at the failing cue.
func (ltc *loadTupContent) WriteSRT(w io.Writer) error {

	/*
		SRT file format (example from Wikipedia):

		1
		00:02:17,440 --> 00:02:20,375
		Senator, we're making
		our final approach into Coruscant.

		2
		00:02:20,476 --> 00:02:22,501
		Very good, Lieutenant.
	*/

	// Every caption is indexed into Secs below, so refuse to run (rather than
	// panic) when the caller skipped Validate and the data is short.
	if len(ltc.Secs) < len(ltc.Caps) {
		return fmt.Errorf("secs/caps length mismatch: %d timestamps for %d captions", len(ltc.Secs), len(ltc.Caps))
	}

	ctr := 1
	last := len(ltc.Caps) - 1
	for i, caption := range ltc.Caps {
		if caption == "" {
			// Don't show anything
			continue
		}

		start := secs_to_srt_time(ltc.Secs[i])

		var end string
		if i < last {
			end = secs_to_srt_time(ltc.Secs[i+1])
		} else {
			// The final subtitle. We don't know how long it should be displayed
			// for since we don't know the entire video's duration
			// FIXME supply
			// Assume 3 seconds
			end = secs_to_srt_time(ltc.Secs[i] + 3)
		}

		if _, err := fmt.Fprintf(w, "%d\n%s --> %s\n%s\n\n", ctr, start, end, caption); err != nil {
			return fmt.Errorf("writing SRT cue %d: %w", ctr, err)
		}

		// We emitted a message, increase the counter
		ctr++
	}

	return nil
}