Move media downloader from persistence to scraper package; add 429 Rate Limited error type
commit aa05708e20
parent 1ba4f91463
@@ -1,11 +1,8 @@
 package persistence
 
 import (
-	"encoding/json"
 	"errors"
 	"fmt"
-	"io"
-	"net/http"
 	"os"
 	"path"
 	"strings"
@@ -19,47 +16,15 @@ type MediaDownloader interface {
 
 type DefaultDownloader struct{}
 
-var ErrorDMCA error = errors.New("video is DMCAed, unable to download (HTTP 403 Forbidden)")
-
 // Download a file over HTTP and save it.
 //
 // args:
 // - url: the remote file to download
 // - outpath: the path on disk to save it to
 func (d DefaultDownloader) Curl(url string, outpath string) error {
-	fmt.Println(url)
-	resp, err := http.Get(url)
+	data, err := scraper.DownloadMedia(url)
 	if err != nil {
-		return fmt.Errorf("Error executing HTTP GET(%q):\n %w", url, err)
-	}
-
-	if resp.StatusCode == 403 {
-		var response struct {
-			Error_response string `json:"error_response"`
-		}
-		body, err := io.ReadAll(resp.Body)
-		if err != nil {
-			panic(err)
-		}
-		fmt.Println(string(body))
-
-		err = json.Unmarshal(body, &response)
-		if err != nil {
-			panic(err)
-		}
-		if response.Error_response == "Dmcaed" {
-			return ErrorDMCA
-		}
-		// Not a DCMA; fall through
-	}
-
-	if resp.StatusCode != 200 {
-		return fmt.Errorf("Error %s: %s", url, resp.Status)
-	}
-
-	data, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return fmt.Errorf("Error downloading image %s:\n %w", url, err)
+		return fmt.Errorf("downloading %q:\n %w", url, err)
 	}
 
 	// Ensure the output directory exists
@@ -96,7 +61,7 @@ func (p Profile) download_tweet_video(v *scraper.Video, downloader MediaDownload
 	outfile := path.Join(p.ProfileDir, "videos", v.LocalFilename)
 
 	err := downloader.Curl(v.RemoteURL, outfile)
-	if errors.Is(err, ErrorDMCA) {
+	if errors.Is(err, scraper.ErrorDMCA) {
 		v.IsDownloaded = false
 		v.IsBlockedByDMCA = true
 	} else if err != nil {
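Taken together, the persistence hunks above leave Curl with only the local half of the job: the HTTP request now goes through the scraper package, and persistence just writes the returned bytes to disk. A rough, self-contained sketch of that flow follows; the import path, MkdirAll call, and file modes are assumptions, since the directory handling sits outside the hunks shown here.

package persistence

import (
	"fmt"
	"os"
	"path"

	"example.com/offline_twitter/scraper" // assumed import path
)

// saveMedia is an illustrative stand-in for the new Curl flow: fetch the
// bytes through the scraper package, then persist them locally.
func saveMedia(url string, outpath string) error {
	data, err := scraper.DownloadMedia(url)
	if err != nil {
		return fmt.Errorf("downloading %q:\n %w", url, err)
	}
	// Ensure the output directory exists (assumed to mirror what Curl does)
	if err := os.MkdirAll(path.Dir(outpath), 0755); err != nil {
		return fmt.Errorf("creating directory for %q:\n %w", outpath, err)
	}
	return os.WriteFile(outpath, data, 0644)
}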
@@ -9,4 +9,6 @@ var (
 	ErrDoesntExist = errors.New("Doesn't exist")
 	EXTERNAL_API_ERROR = errors.New("Unexpected result from external API")
 	ErrorIsTombstone = errors.New("tweet is a tombstone")
+	ErrRateLimited = errors.New("rate limited")
+	ErrorDMCA = errors.New("video is DMCAed, unable to download (HTTP 403 Forbidden)")
 )
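Both new values are plain sentinel errors, so callers can branch on them with errors.Is even when they come back wrapped via %w, as the 429 path further down does. A minimal sketch of that caller-side check; the import path is an assumption.

package main

import (
	"errors"
	"fmt"

	"example.com/offline_twitter/scraper" // assumed import path
)

// classify maps a scraper error onto the failure mode it represents.
func classify(err error) string {
	switch {
	case err == nil:
		return "ok"
	case errors.Is(err, scraper.ErrorDMCA):
		return "blocked by DMCA (HTTP 403)"
	case errors.Is(err, scraper.ErrRateLimited):
		return "rate limited (HTTP 429)"
	default:
		return fmt.Sprintf("other error: %s", err)
	}
}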
@@ -165,23 +165,19 @@ func (api *API) do_http_POST(url string, body string, result interface{}) error
 
 	defer resp.Body.Close()
 
-	if resp.StatusCode != 200 {
-		content, err := io.ReadAll(resp.Body)
-		if err != nil {
-			panic(err)
-		}
+	respBody, err := io.ReadAll(resp.Body)
+	if err != nil {
+		panic(err)
+	}
 
+	if resp.StatusCode != 200 {
 		responseHeaders := ""
 		for header := range resp.Header {
 			responseHeaders += fmt.Sprintf(" %s: %s\n", header, resp.Header.Get(header))
 		}
-		return fmt.Errorf("HTTP %s\n%s\n%s", resp.Status, responseHeaders, content)
+		return fmt.Errorf("HTTP %s\n%s\n%s", resp.Status, responseHeaders, respBody)
 	}
 
-	respBody, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return fmt.Errorf("Error reading response body:\n %w", err)
-	}
 	log.Debug(string(respBody))
 
 	err = json.Unmarshal(respBody, result)
@@ -221,23 +217,25 @@ func (api *API) do_http(url string, cursor string, result interface{}) error {
 		api.update_csrf_token()
 	}
 
-	if resp.StatusCode != 200 && resp.StatusCode != 403 {
-		content, err := io.ReadAll(resp.Body)
-		if err != nil {
-			panic(err)
-		}
-		responseHeaders := ""
-		for header := range resp.Header {
-			responseHeaders += fmt.Sprintf(" %s: %s\n", header, resp.Header.Get(header))
-		}
-		return fmt.Errorf("HTTP Error. HTTP %s\n%s\nbody: %s", resp.Status, responseHeaders, content)
+	if resp.StatusCode == 429 {
+		// "Too many requests" => rate limited
+		reset_at := TimestampFromUnix(int64(int_or_panic(resp.Header.Get("X-Rate-Limit-Reset"))))
+		return fmt.Errorf("%w (resets at %d, which is in %s)", ErrRateLimited, reset_at.Unix(), time.Until(reset_at.Time).String())
 	}
 
 	body, err := io.ReadAll(resp.Body)
 	if err != nil {
-		return fmt.Errorf("Error reading response body:\n %w", err)
+		panic(err)
 	}
 
+	if resp.StatusCode != 200 && resp.StatusCode != 403 {
+		responseHeaders := ""
+		for header := range resp.Header {
+			responseHeaders += fmt.Sprintf(" %s: %s\n", header, resp.Header.Get(header))
+		}
+		return fmt.Errorf("HTTP Error. HTTP %s\n%s\nbody: %s", resp.Status, responseHeaders, body)
+	}
+
 	log.Debug(string(body))
 
 	err = json.Unmarshal(body, result)
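Because ErrRateLimited is wrapped with %w, a caller can detect the 429 case with errors.Is and retry after a pause; the X-Rate-Limit-Reset timestamp itself only appears in the error text. A hypothetical retry helper, not part of this commit, with an assumed import path and a deliberately crude fixed backoff:

package main

import (
	"errors"
	"time"

	"example.com/offline_twitter/scraper" // assumed import path
)

// withRateLimitRetry retries `fetch` while it keeps failing with the
// rate-limit sentinel, sleeping a fixed interval between attempts.
func withRateLimitRetry(fetch func() error, wait time.Duration, attempts int) error {
	var err error
	for i := 0; i < attempts; i++ {
		err = fetch()
		if !errors.Is(err, scraper.ErrRateLimited) {
			return err // success, or a failure that retrying won't help
		}
		time.Sleep(wait) // crude fixed backoff; the header-derived reset time is not surfaced to callers
	}
	return err
}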
@@ -370,3 +368,76 @@ func (api *API) GetMoreReplies(tweet_id TweetID, response *TweetResponse, max_re
 	}
 	return nil
 }
+
+func DownloadMedia(url string) ([]byte, error) {
+	return the_api.DownloadMedia(url)
+}
+
+func (api *API) DownloadMedia(remote_url string) ([]byte, error) {
+	fmt.Printf("Downloading: %s\n", remote_url)
+	req, err := http.NewRequest("GET", remote_url, nil)
+	if err != nil {
+		panic(err)
+	}
+	// api.add_authentication_headers(req)
+	// req.Header.Set("Referer", "https://twitter.com/") // DM embedded images require this header
+
+	resp, err := api.Client.Do(req)
+	if err != nil {
+		return []byte{}, fmt.Errorf("Error executing HTTP request:\n %w", err)
+	}
+	defer resp.Body.Close()
+
+	if api.IsAuthenticated {
+		// New request has been made, so the cookie will be changed; update the csrf to match
+		api.update_csrf_token()
+	}
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		panic(err)
+	}
+
+	if resp.StatusCode == 403 {
+		var response struct {
+			Error_response string `json:"error_response"`
+		}
+		fmt.Println(string(body))
+
+		err = json.Unmarshal(body, &response)
+		if err != nil {
+			panic(err)
+		}
+		if response.Error_response == "Dmcaed" {
+			return body, ErrorDMCA
+		}
+		// Not a DMCA; fall through
+	}
+
+	if resp.StatusCode != 200 {
+		print_curl_cmd(*req, api.Client.Jar.Cookies(req.URL))
+
+		responseHeaders := ""
+		for header := range resp.Header {
+			responseHeaders += fmt.Sprintf(" %s: %s\n", header, resp.Header.Get(header))
+		}
+		log.Debug(responseHeaders)
+		return body, fmt.Errorf("HTTP Error. HTTP %s\n%s\nbody: %s", resp.Status, responseHeaders, body)
+	}
+
+	// Status code is HTTP 200
+	return body, nil
+}
+
+func print_curl_cmd(r http.Request, cookies []*http.Cookie) {
+	fmt.Printf("curl -X %s %q \\\n", r.Method, r.URL.String())
+	for header := range r.Header {
+		fmt.Printf(" -H '%s: %s' \\\n", header, r.Header.Get(header))
+	}
+	fmt.Printf(" -H 'Cookie: ")
+	for _, c := range cookies {
+		fmt.Printf("%s=%s;", c.Name, c.Value)
+	}
+	fmt.Printf("' \\\n")
+	fmt.Printf(" --compressed\n")
+}
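Since persistence now reaches the network only through scraper.DownloadMedia and the scraper.ErrorDMCA sentinel, the DMCA branch in download_tweet_video can be exercised without any HTTP. A hypothetical test double, not part of this commit, assuming the MediaDownloader interface consists of the single Curl(url, outpath string) error method seen above:

package persistence

import "example.com/offline_twitter/scraper" // assumed import path

// dmcaDownloader pretends every video is behind a DMCA takedown, which is
// what the real downloader surfaces for a 403 response with a "Dmcaed" body.
type dmcaDownloader struct{}

func (d dmcaDownloader) Curl(url string, outpath string) error {
	return scraper.ErrorDMCA
}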