From b076f4d2f252f6d9dc1006ced22f2dcecbbdfe2c Mon Sep 17 00:00:00 2001 From: Alessio Date: Sun, 3 Mar 2024 14:56:01 -0800 Subject: [PATCH] Scraper: handle videos being geo-blocked, which was panicking the downloader --- pkg/persistence/media_download.go | 7 ++++++- pkg/scraper/api_types.go | 4 ++++ pkg/scraper/video.go | 2 ++ pkg/scraper/video_test.go | 23 +++++++++++++++++++---- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/pkg/persistence/media_download.go b/pkg/persistence/media_download.go index 502a29a..730a4c4 100644 --- a/pkg/persistence/media_download.go +++ b/pkg/persistence/media_download.go @@ -27,7 +27,7 @@ var ErrorDMCA error = errors.New("video is DMCAed, unable to download (HTTP 403 // - url: the remote file to download // - outpath: the path on disk to save it to func (d DefaultDownloader) Curl(url string, outpath string) error { - println(url) + fmt.Println(url) resp, err := http.Get(url) if err != nil { return fmt.Errorf("Error executing HTTP GET(%q):\n %w", url, err) @@ -150,6 +150,11 @@ func (p Profile) DownloadTweetContentWithInjector(t *scraper.Tweet, downloader M } for i := range t.Videos { + // Videos can be geoblocked, and the HTTP response isn't in JSON so it's hard to capture + if t.Videos[i].IsGeoblocked { + continue + } + err := p.download_tweet_video(&t.Videos[i], downloader) if err != nil { return err diff --git a/pkg/scraper/api_types.go b/pkg/scraper/api_types.go index 9caaf36..00d177f 100644 --- a/pkg/scraper/api_types.go +++ b/pkg/scraper/api_types.go @@ -38,6 +38,10 @@ type APIExtendedMedia struct { Variants SortableVariants `json:"variants"` Duration int `json:"duration_millis"` } `json:"video_info"` + ExtMediaAvailability struct { + Status string `json:"status"` + Reason string `json:"reason"` + } `json:"ext_media_availability"` OriginalInfo struct { Width int `json:"width"` Height int `json:"height"` diff --git a/pkg/scraper/video.go b/pkg/scraper/video.go index a3c3ae5..947128e 100644 --- a/pkg/scraper/video.go +++ b/pkg/scraper/video.go @@ -26,6 +26,7 @@ type Video struct { IsDownloaded bool `db:"is_downloaded"` IsBlockedByDMCA bool `db:"is_blocked_by_dmca"` + IsGeoblocked bool `db:"is_geoblocked"` IsGif bool `db:"is_gif"` } @@ -78,6 +79,7 @@ func ParseAPIVideo(apiVideo APIExtendedMedia, tweet_id TweetID) Video { IsDownloaded: false, IsBlockedByDMCA: false, + IsGeoblocked: apiVideo.ExtMediaAvailability.Reason == "Geoblocked", IsGif: apiVideo.Type == "animated_gif", } } diff --git a/pkg/scraper/video_test.go b/pkg/scraper/video_test.go index 1932ce5..a394877 100644 --- a/pkg/scraper/video_test.go +++ b/pkg/scraper/video_test.go @@ -13,13 +13,13 @@ import ( func TestParseAPIVideo(t *testing.T) { assert := assert.New(t) + require := require.New(t) data, err := os.ReadFile("test_responses/tweet_content/video.json") - if err != nil { - panic(err) - } + require.NoError(err) + var apivideo APIExtendedMedia err = json.Unmarshal(data, &apivideo) - require.NoError(t, err) + require.NoError(err) tweet_id := TweetID(28) video := ParseAPIVideo(apivideo, tweet_id) @@ -35,3 +35,18 @@ func TestParseAPIVideo(t *testing.T) { assert.Equal(88300, video.Duration) assert.False(video.IsDownloaded) } + +func TestParseGeoblockedVideo(t *testing.T) { + assert := assert.New(t) + require := require.New(t) + data, err := os.ReadFile("test_responses/tweet_content/video_geoblocked.json") + require.NoError(err) + + var apivideo APIExtendedMedia + err = json.Unmarshal(data, &apivideo) + require.NoError(err) + + tweet_id := TweetID(28) + video := ParseAPIVideo(apivideo, tweet_id) + assert.True(video.IsGeoblocked) +}