Handle media download 404s gracefully

This commit is contained in:
Alessio 2024-07-28 12:50:00 -07:00
parent d0c23b392b
commit 8aca12695b
4 changed files with 14 additions and 8 deletions

View File

@ -65,7 +65,7 @@ func (p Profile) SaveTweetTrove(trove TweetTrove, should_download bool) {
if should_download { if should_download {
err = p.DownloadTweetContentFor(&t) err = p.DownloadTweetContentFor(&t)
if errors.Is(err, ErrRequestTimeout) { if errors.Is(err, ErrRequestTimeout) || errors.Is(err, ErrMediaDownload404) {
// Forget about it; if it's important someone will try again // Forget about it; if it's important someone will try again
fmt.Printf("Failed to download tweet ID %d: %s\n", t.ID, err.Error()) fmt.Printf("Failed to download tweet ID %d: %s\n", t.ID, err.Error())
} else if err != nil { } else if err != nil {

View File

@ -11,6 +11,7 @@ var (
ErrorIsTombstone = errors.New("tweet is a tombstone") ErrorIsTombstone = errors.New("tweet is a tombstone")
ErrRateLimited = errors.New("rate limited") ErrRateLimited = errors.New("rate limited")
ErrorDMCA = errors.New("video is DMCAed, unable to download (HTTP 403 Forbidden)") ErrorDMCA = errors.New("video is DMCAed, unable to download (HTTP 403 Forbidden)")
ErrMediaDownload404 = errors.New("media download HTTP 404")
// These are not API errors, but network errors generally // These are not API errors, but network errors generally
ErrNoInternet = errors.New("no internet connection") ErrNoInternet = errors.New("no internet connection")

View File

@ -452,6 +452,11 @@ func (api *API) DownloadMedia(remote_url string) ([]byte, error) {
// Not a DMCA; fall through // Not a DMCA; fall through
} }
if resp.StatusCode == 404 {
log.Debugf("Media download 404 (%s)", remote_url)
return body, ErrMediaDownload404
}
if resp.StatusCode != 200 { if resp.StatusCode != 200 {
url, err := url.Parse(remote_url) url, err := url.Parse(remote_url)
if err != nil { if err != nil {

View File

@ -2,10 +2,10 @@ package scraper
import ( import (
"fmt" "fmt"
"log"
"net/url" "net/url"
"path" "path"
"regexp" "regexp"
"log"
) )
type Url struct { type Url struct {