From 6eac56183d5bfe849f99affa6958dab505a088cc Mon Sep 17 00:00:00 2001 From: Alessio Date: Fri, 7 Jan 2022 13:40:22 -0500 Subject: [PATCH] Allow fetching of banned users --- cmd/tests.sh | 5 +++ persistence/media_download.go | 14 ++++++- persistence/user_queries.go | 6 +++ scraper/api_request_utils.go | 69 ++++++++++++++++++----------------- 4 files changed, 59 insertions(+), 35 deletions(-) diff --git a/cmd/tests.sh b/cmd/tests.sh index 9e49bde..c88bd14 100755 --- a/cmd/tests.sh +++ b/cmd/tests.sh @@ -207,6 +207,11 @@ test "$(sqlite3 twitter.db "select choice1, choice2, choice3, choice4 from polls test "$(sqlite3 twitter.db "select choice1_votes, choice2_votes, choice3_votes, choice4_votes from polls where tweet_id = 1465534109573390348")" = "1593|624|778|1138" +# Test fetching a banned user +test $(sqlite3 twitter.db "select is_content_downloaded from users where handle='kanesays23'") = "0" +tw fetch_user kanesays23 +test "$(sqlite3 twitter.db "select is_content_downloaded, is_banned from users where handle='kanesays23'")" = "1|1" + # TODO: Maybe this file should be broken up into multiple test scripts echo -e "\033[32mAll tests passed. Finished successfully.\033[0m" diff --git a/persistence/media_download.go b/persistence/media_download.go index 649f49a..40bf1ac 100644 --- a/persistence/media_download.go +++ b/persistence/media_download.go @@ -147,8 +147,18 @@ func (p Profile) DownloadUserContentFor(u *scraper.User) error { * Enable injecting a custom MediaDownloader (i.e., for testing) */ func (p Profile) DownloadUserContentWithInjector(u *scraper.User, downloader MediaDownloader) error { - outfile := path.Join(p.ProfileDir, "profile_images", u.ProfileImageLocalPath) - err := downloader.Curl(u.ProfileImageUrl, outfile) + var outfile string + var target_url string + + if u.ProfileImageUrl == "" { + outfile = path.Join(p.ProfileDir, "profile_images", path.Base(scraper.DEFAULT_PROFILE_IMAGE_URL)) + target_url = scraper.DEFAULT_PROFILE_IMAGE_URL + } else { + outfile = path.Join(p.ProfileDir, "profile_images", u.ProfileImageLocalPath) + target_url = u.ProfileImageUrl + } + + err := downloader.Curl(target_url, outfile) if err != nil { return err } diff --git a/persistence/user_queries.go b/persistence/user_queries.go index 31be56a..75c3a05 100644 --- a/persistence/user_queries.go +++ b/persistence/user_queries.go @@ -152,6 +152,8 @@ func (p Profile) GetUserByID(id scraper.UserID) (scraper.User, error) { /** * Returns `true` if content download is needed, `false` otherwise * + * If the user is banned, returns false because downloading will be impossible. + * * If: * - the user isn't in the DB at all (first time scraping), OR * - `is_content_downloaded` is false in the DB, OR @@ -162,6 +164,10 @@ func (p Profile) GetUserByID(id scraper.UserID) (scraper.User, error) { * why the No Worsening Principle is needed. */ func (p Profile) CheckUserContentDownloadNeeded(user scraper.User) bool { + if user.IsBanned { + // Check `is_banned` on the live user, since he may have been un-banned since last scraped + return false + } row := p.DB.QueryRow(`select is_content_downloaded, profile_image_url, banner_image_url from users where id = ?`, user.ID) var is_content_downloaded bool diff --git a/scraper/api_request_utils.go b/scraper/api_request_utils.go index cb906fd..1680f2f 100644 --- a/scraper/api_request_utils.go +++ b/scraper/api_request_utils.go @@ -173,49 +173,52 @@ func UpdateQueryCursor(req *http.Request, new_cursor string, is_tweet bool) { func (api API) GetUser(handle UserHandle) (APIUser, error) { - client := &http.Client{Timeout: 10 * time.Second} - req, err := http.NewRequest("GET", "https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22" + string(handle) + "%22%2C%22withHighlightedLabel%22%3Atrue%7D", nil) - if err != nil { - return APIUser{}, err - } + client := &http.Client{Timeout: 10 * time.Second} + req, err := http.NewRequest("GET", "https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22" + string(handle) + "%22%2C%22withHighlightedLabel%22%3Atrue%7D", nil) + if err != nil { + return APIUser{}, err + } err = ApiRequestAddTokens(req) if err != nil { return APIUser{}, err } - var response UserResponse + var response UserResponse for retries := 0; retries < 3; retries += 1 { - resp, err := client.Do(req) - if err != nil { - return APIUser{}, err - } - defer resp.Body.Close() + resp, err := client.Do(req) + if err != nil { + return APIUser{}, err + } + defer resp.Body.Close() - // Sometimes it randomly gives 403 Forbidden. API's fault, not ours - // We check for this below - if !(resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusForbidden) { - content, _ := ioutil.ReadAll(resp.Body) - return APIUser{}, fmt.Errorf("response status %s: %s", resp.Status, content) - } + // Sometimes it randomly gives 403 Forbidden. API's fault, not ours + // We check for this below + if !(resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusForbidden) { + content, _ := ioutil.ReadAll(resp.Body) + return APIUser{}, fmt.Errorf("response status %s: %s", resp.Status, content) + } - body, err := ioutil.ReadAll(resp.Body) - if err != nil { - return APIUser{}, err - } + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + return APIUser{}, err + } - err = json.Unmarshal(body, &response) - if err != nil { - return APIUser{}, err - } + err = json.Unmarshal(body, &response) + if err != nil { + return APIUser{}, err + } - if len(response.Errors) == 0 { - break - } - - // Reset the response (remove the Errors) - response = UserResponse{} - } - return response.ConvertToAPIUser(), err + // Retry ONLY if the error is code 50 (random authentication failure), NOT on real errors + if len(response.Errors) == 1 && response.Errors[0].Code == 50 { + // Reset the response (remove the Errors) + response = UserResponse{} + continue + } else { + // Do not retry on real errors + break + } + } + return response.ConvertToAPIUser(), err } func (api API) Search(query string, cursor string) (TweetResponse, error) {