Scraper: handle videos being geo-blocked, which was panicking the downloader

This commit is contained in:
Alessio 2024-03-03 14:56:01 -08:00
parent c19d36d053
commit b076f4d2f2
4 changed files with 31 additions and 5 deletions

View File

@ -27,7 +27,7 @@ var ErrorDMCA error = errors.New("video is DMCAed, unable to download (HTTP 403
// - url: the remote file to download
// - outpath: the path on disk to save it to
func (d DefaultDownloader) Curl(url string, outpath string) error {
println(url)
fmt.Println(url)
resp, err := http.Get(url)
if err != nil {
return fmt.Errorf("Error executing HTTP GET(%q):\n %w", url, err)
@ -150,6 +150,11 @@ func (p Profile) DownloadTweetContentWithInjector(t *scraper.Tweet, downloader M
}
for i := range t.Videos {
// Videos can be geoblocked, and the HTTP response isn't JSON, so the error is hard to capture at download time; skip them up front
if t.Videos[i].IsGeoblocked {
continue
}
err := p.download_tweet_video(&t.Videos[i], downloader)
if err != nil {
return err

View File

@ -38,6 +38,10 @@ type APIExtendedMedia struct {
Variants SortableVariants `json:"variants"`
Duration int `json:"duration_millis"`
} `json:"video_info"`
ExtMediaAvailability struct {
Status string `json:"status"`
Reason string `json:"reason"`
} `json:"ext_media_availability"`
OriginalInfo struct {
Width int `json:"width"`
Height int `json:"height"`

View File

@ -26,6 +26,7 @@ type Video struct {
IsDownloaded bool `db:"is_downloaded"`
IsBlockedByDMCA bool `db:"is_blocked_by_dmca"`
IsGeoblocked bool `db:"is_geoblocked"`
IsGif bool `db:"is_gif"`
}
@ -78,6 +79,7 @@ func ParseAPIVideo(apiVideo APIExtendedMedia, tweet_id TweetID) Video {
IsDownloaded: false,
IsBlockedByDMCA: false,
IsGeoblocked: apiVideo.ExtMediaAvailability.Reason == "Geoblocked",
IsGif: apiVideo.Type == "animated_gif",
}
}

View File

@ -13,13 +13,13 @@ import (
func TestParseAPIVideo(t *testing.T) {
assert := assert.New(t)
require := require.New(t)
data, err := os.ReadFile("test_responses/tweet_content/video.json")
if err != nil {
panic(err)
}
require.NoError(err)
var apivideo APIExtendedMedia
err = json.Unmarshal(data, &apivideo)
require.NoError(t, err)
require.NoError(err)
tweet_id := TweetID(28)
video := ParseAPIVideo(apivideo, tweet_id)
@ -35,3 +35,18 @@ func TestParseAPIVideo(t *testing.T) {
assert.Equal(88300, video.Duration)
assert.False(video.IsDownloaded)
}
func TestParseGeoblockedVideo(t *testing.T) {
assert := assert.New(t)
require := require.New(t)
data, err := os.ReadFile("test_responses/tweet_content/video_geoblocked.json")
require.NoError(err)
var apivideo APIExtendedMedia
err = json.Unmarshal(data, &apivideo)
require.NoError(err)
tweet_id := TweetID(28)
video := ParseAPIVideo(apivideo, tweet_id)
assert.True(video.IsGeoblocked)
}