From aa05708e20a8d855b0a0553d98f4a4f8f1ae92bd Mon Sep 17 00:00:00 2001 From: Alessio Date: Mon, 11 Mar 2024 12:57:58 -0700 Subject: [PATCH] Move media downloader from persistence to scraper package; add 429 Rate Limited error type --- pkg/persistence/media_download.go | 41 +---------- pkg/scraper/api_errors.go | 2 + pkg/scraper/api_request_utils.go | 115 ++++++++++++++++++++++++------ 3 files changed, 98 insertions(+), 60 deletions(-) diff --git a/pkg/persistence/media_download.go b/pkg/persistence/media_download.go index 730a4c4..807159e 100644 --- a/pkg/persistence/media_download.go +++ b/pkg/persistence/media_download.go @@ -1,11 +1,8 @@ package persistence import ( - "encoding/json" "errors" "fmt" - "io" - "net/http" "os" "path" "strings" @@ -19,47 +16,15 @@ type MediaDownloader interface { type DefaultDownloader struct{} -var ErrorDMCA error = errors.New("video is DMCAed, unable to download (HTTP 403 Forbidden)") - // Download a file over HTTP and save it. // // args: // - url: the remote file to download // - outpath: the path on disk to save it to func (d DefaultDownloader) Curl(url string, outpath string) error { - fmt.Println(url) - resp, err := http.Get(url) + data, err := scraper.DownloadMedia(url) if err != nil { - return fmt.Errorf("Error executing HTTP GET(%q):\n %w", url, err) - } - - if resp.StatusCode == 403 { - var response struct { - Error_response string `json:"error_response"` - } - body, err := io.ReadAll(resp.Body) - if err != nil { - panic(err) - } - fmt.Println(string(body)) - - err = json.Unmarshal(body, &response) - if err != nil { - panic(err) - } - if response.Error_response == "Dmcaed" { - return ErrorDMCA - } - // Not a DCMA; fall through - } - - if resp.StatusCode != 200 { - return fmt.Errorf("Error %s: %s", url, resp.Status) - } - - data, err := io.ReadAll(resp.Body) - if err != nil { - return fmt.Errorf("Error downloading image %s:\n %w", url, err) + return fmt.Errorf("downloading %q:\n %w", url, err) } // Ensure the output directory exists @@ -96,7 +61,7 @@ func (p Profile) download_tweet_video(v *scraper.Video, downloader MediaDownload outfile := path.Join(p.ProfileDir, "videos", v.LocalFilename) err := downloader.Curl(v.RemoteURL, outfile) - if errors.Is(err, ErrorDMCA) { + if errors.Is(err, scraper.ErrorDMCA) { v.IsDownloaded = false v.IsBlockedByDMCA = true } else if err != nil { diff --git a/pkg/scraper/api_errors.go b/pkg/scraper/api_errors.go index 24876a9..4acd14b 100644 --- a/pkg/scraper/api_errors.go +++ b/pkg/scraper/api_errors.go @@ -9,4 +9,6 @@ var ( ErrDoesntExist = errors.New("Doesn't exist") EXTERNAL_API_ERROR = errors.New("Unexpected result from external API") ErrorIsTombstone = errors.New("tweet is a tombstone") + ErrRateLimited = errors.New("rate limited") + ErrorDMCA = errors.New("video is DMCAed, unable to download (HTTP 403 Forbidden)") ) diff --git a/pkg/scraper/api_request_utils.go b/pkg/scraper/api_request_utils.go index 4c2ef73..d69eb1b 100644 --- a/pkg/scraper/api_request_utils.go +++ b/pkg/scraper/api_request_utils.go @@ -165,23 +165,19 @@ func (api *API) do_http_POST(url string, body string, result interface{}) error defer resp.Body.Close() - if resp.StatusCode != 200 { - content, err := io.ReadAll(resp.Body) - if err != nil { - panic(err) - } + respBody, err := io.ReadAll(resp.Body) + if err != nil { + panic(err) + } + if resp.StatusCode != 200 { responseHeaders := "" for header := range resp.Header { responseHeaders += fmt.Sprintf(" %s: %s\n", header, resp.Header.Get(header)) } - return fmt.Errorf("HTTP %s\n%s\n%s", resp.Status, responseHeaders, content) + return fmt.Errorf("HTTP %s\n%s\n%s", resp.Status, responseHeaders, respBody) } - respBody, err := io.ReadAll(resp.Body) - if err != nil { - return fmt.Errorf("Error reading response body:\n %w", err) - } log.Debug(string(respBody)) err = json.Unmarshal(respBody, result) @@ -221,23 +217,25 @@ func (api *API) do_http(url string, cursor string, result interface{}) error { api.update_csrf_token() } - if resp.StatusCode != 200 && resp.StatusCode != 403 { - content, err := io.ReadAll(resp.Body) - if err != nil { - panic(err) - } - - responseHeaders := "" - for header := range resp.Header { - responseHeaders += fmt.Sprintf(" %s: %s\n", header, resp.Header.Get(header)) - } - return fmt.Errorf("HTTP Error. HTTP %s\n%s\nbody: %s", resp.Status, responseHeaders, content) + if resp.StatusCode == 429 { + // "Too many requests" => rate limited + reset_at := TimestampFromUnix(int64(int_or_panic(resp.Header.Get("X-Rate-Limit-Reset")))) + return fmt.Errorf("%w (resets at %d, which is in %s)", ErrRateLimited, reset_at.Unix(), time.Until(reset_at.Time).String()) } body, err := io.ReadAll(resp.Body) if err != nil { - return fmt.Errorf("Error reading response body:\n %w", err) + panic(err) } + + if resp.StatusCode != 200 && resp.StatusCode != 403 { + responseHeaders := "" + for header := range resp.Header { + responseHeaders += fmt.Sprintf(" %s: %s\n", header, resp.Header.Get(header)) + } + return fmt.Errorf("HTTP Error. HTTP %s\n%s\nbody: %s", resp.Status, responseHeaders, body) + } + log.Debug(string(body)) err = json.Unmarshal(body, result) @@ -370,3 +368,76 @@ func (api *API) GetMoreReplies(tweet_id TweetID, response *TweetResponse, max_re } return nil } + +func DownloadMedia(url string) ([]byte, error) { + return the_api.DownloadMedia(url) +} + +func (api *API) DownloadMedia(remote_url string) ([]byte, error) { + fmt.Printf("Downloading: %s\n", remote_url) + req, err := http.NewRequest("GET", remote_url, nil) + if err != nil { + panic(err) + } + // api.add_authentication_headers(req) + // req.Header.Set("Referer", "https://twitter.com/") // DM embedded images require this header + + resp, err := api.Client.Do(req) + if err != nil { + return []byte{}, fmt.Errorf("Error executing HTTP request:\n %w", err) + } + defer resp.Body.Close() + + if api.IsAuthenticated { + // New request has been made, so the cookie will be changed; update the csrf to match + api.update_csrf_token() + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + panic(err) + } + + if resp.StatusCode == 403 { + var response struct { + Error_response string `json:"error_response"` + } + fmt.Println(string(body)) + + err = json.Unmarshal(body, &response) + if err != nil { + panic(err) + } + if response.Error_response == "Dmcaed" { + return body, ErrorDMCA + } + // Not a DCMA; fall through + } + + if resp.StatusCode != 200 { + print_curl_cmd(req, api.Client.Jar.Cookies(url)) + + responseHeaders := "" + for header := range resp.Header { + responseHeaders += fmt.Sprintf(" %s: %s\n", header, resp.Header.Get(header)) + } + log.Debug(responseHeaders) + return body, fmt.Errorf("HTTP Error. HTTP %s\n%s\nbody: %s", resp.Status, responseHeaders, body) + } + + // Status code is HTTP 200 + return body, nil +} + +func print_curl_cmd(r http.Request, cookies []*http.Cookie) { + fmt.Printf("curl -X %s %q \\\n", r.Method, r.URL.String()) + for header := range r.Header { + fmt.Printf(" -H '%s: %s' \\\n", header, r.Header.Get(header)) + } + fmt.Printf(" -H 'Cookie: ") + for _, c := range cookies { + fmt.Printf("%s=%s;", c.Name, c.Value) + } + fmt.Printf("' \\\n") + fmt.Printf(" --compressed\n") +}