From a1faacaf6b2724a2025feae5bf9f97809e0e082f Mon Sep 17 00:00:00 2001
From: Alessio
Date: Mon, 10 Jun 2024 19:59:08 -0700
Subject: [PATCH] Don't throw away all previous results if you get
 rate-limited, just save them

---
 cmd/twitter/helpers.go                     |  9 ++++
 cmd/twitter/main.go                        | 24 +++++-----
 internal/webserver/handler_tweet_detail.go |  4 +-
 pkg/scraper/api_types_v2.go                | 56 +++++++++++-----------
 pkg/scraper/search.go                      | 17 -------
 5 files changed, 52 insertions(+), 58 deletions(-)
 delete mode 100644 pkg/scraper/search.go

diff --git a/cmd/twitter/helpers.go b/cmd/twitter/helpers.go
index b62feb6..bbba422 100644
--- a/cmd/twitter/helpers.go
+++ b/cmd/twitter/helpers.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	_ "embed"
+	"errors"
 	"fmt"
 	"os"
 	"path/filepath"
@@ -82,3 +83,11 @@ func get_default_profile() string {
 	}
 	return filepath.Join(app_data_dir, "twitter")
 }
+
+// Returns whether this error should be treated as a failure
+func is_scrape_failure(err error) bool {
+	if err == nil || errors.Is(err, scraper.END_OF_FEED) || errors.Is(err, scraper.ErrRateLimited) {
+		return false
+	}
+	return true
+}
diff --git a/cmd/twitter/main.go b/cmd/twitter/main.go
index cd7ed1a..2b3ca9b 100644
--- a/cmd/twitter/main.go
+++ b/cmd/twitter/main.go
@@ -261,7 +261,7 @@ func create_profile(target_dir string) {
  */
 func fetch_user(handle scraper.UserHandle) {
 	user, err := scraper.GetUser(handle)
-	if err != nil {
+	if is_scrape_failure(err) {
 		die(err.Error(), false, -1)
 	}
 	log.Debug(user)
@@ -288,7 +288,7 @@ func fetch_tweet_only(tweet_identifier string) {
 	}
 
 	tweet, err := scraper.GetTweet(tweet_id)
-	if err != nil {
+	if is_scrape_failure(err) {
 		die(fmt.Sprintf("Error fetching tweet: %s", err.Error()), false, -1)
 	}
 	log.Debug(tweet)
@@ -313,7 +313,7 @@ func fetch_tweet_conversation(tweet_identifier string, how_many int) {
 	}
 
 	trove, err := scraper.GetTweetFullAPIV2(tweet_id, how_many)
-	if err != nil {
+	if is_scrape_failure(err) {
 		die(err.Error(), false, -1)
 	}
 	profile.SaveTweetTrove(trove, true)
@@ -329,12 +329,12 @@
  */
 func fetch_user_feed(handle string, how_many int) {
 	user, err := profile.GetUserByHandle(scraper.UserHandle(handle))
-	if err != nil {
+	if is_scrape_failure(err) {
 		die(fmt.Sprintf("Error getting user: %s\n %s", handle, err.Error()), false, -1)
 	}
 
 	trove, err := scraper.GetUserFeedGraphqlFor(user.ID, how_many)
-	if err != nil {
+	if is_scrape_failure(err) {
 		die(fmt.Sprintf("Error scraping feed: %s\n %s", handle, err.Error()), false, -2)
 	}
 	profile.SaveTweetTrove(trove, true)
@@ -349,7 +349,7 @@ func get_user_likes(handle string, how_many int) {
 	}
 
 	trove, err := scraper.GetUserLikes(user.ID, how_many)
-	if err != nil {
+	if is_scrape_failure(err) {
 		die(fmt.Sprintf("Error scraping feed: %s\n %s", handle, err.Error()), false, -2)
 	}
 	profile.SaveTweetTrove(trove, true)
@@ -364,7 +364,7 @@ func get_followees(handle string, how_many int) {
 	}
 
 	trove, err := scraper.GetFollowees(user.ID, how_many)
-	if err != nil {
+	if is_scrape_failure(err) {
 		die(fmt.Sprintf("Error getting followees: %s\n %s", handle, err.Error()), false, -2)
 	}
 	profile.SaveTweetTrove(trove, true)
@@ -378,7 +378,7 @@ func get_followers(handle string, how_many int) {
 		die(fmt.Sprintf("Error getting user: %s\n %s", handle, err.Error()), false, -1)
 	}
 	trove, err := scraper.GetFollowers(user.ID, how_many)
-	if err != nil {
+	if is_scrape_failure(err) {
 		die(fmt.Sprintf("Error getting followees: %s\n %s", handle, err.Error()), false, -2)
 	}
 	profile.SaveTweetTrove(trove, true)
@@ -388,7 +388,7 @@ func get_followers(handle string, how_many int) {
 }
 
 func get_bookmarks(how_many int) {
 	trove, err := scraper.GetBookmarks(how_many)
-	if err != nil {
+	if is_scrape_failure(err) {
 		die(fmt.Sprintf("Error scraping bookmarks:\n %s", err.Error()), false, -2)
 	}
 	profile.SaveTweetTrove(trove, true)
@@ -400,7 +400,7 @@ func get_bookmarks(how_many int) {
 }
 
 func fetch_timeline(is_following_only bool) {
 	trove, err := scraper.GetHomeTimeline("", is_following_only)
-	if err != nil {
+	if is_scrape_failure(err) {
 		die(fmt.Sprintf("Error fetching timeline:\n %s", err.Error()), false, -2)
 	}
 	profile.SaveTweetTrove(trove, true)
@@ -437,7 +437,7 @@ func download_user_content(handle scraper.UserHandle) {
 
 func search(query string, how_many int) {
 	trove, err := scraper.Search(query, how_many)
-	if err != nil {
+	if is_scrape_failure(err) {
 		die(fmt.Sprintf("Error scraping search results: %s", err.Error()), false, -100)
 	}
 	profile.SaveTweetTrove(trove, true)
@@ -506,7 +506,7 @@ func fetch_inbox(how_many int) {
 
 func fetch_dm(id string, how_many int) {
 	room, err := profile.GetChatRoom(scraper.DMChatRoomID(id))
-	if err != nil {
+	if is_scrape_failure(err) {
 		panic(err)
 	}
 	max_id := scraper.DMMessageID(^uint(0) >> 1)
diff --git a/internal/webserver/handler_tweet_detail.go b/internal/webserver/handler_tweet_detail.go
index e673cb7..1c1f49a 100644
--- a/internal/webserver/handler_tweet_detail.go
+++ b/internal/webserver/handler_tweet_detail.go
@@ -50,10 +50,10 @@ func (app *Application) ensure_tweet(id scraper.TweetID, is_forced bool, is_conv
 
 	if is_needing_scrape && !app.IsScrapingDisabled {
 		trove, err := scraper.GetTweetFullAPIV2(id, 50) // TODO: parameterizable
-		if err == nil {
+		if err == nil || errors.Is(err, scraper.END_OF_FEED) || errors.Is(err, scraper.ErrRateLimited) {
 			app.Profile.SaveTweetTrove(trove, false)
 			go app.Profile.SaveTweetTrove(trove, true) // Download the content in the background
-			is_available = true
+			_, is_available = trove.Tweets[id]
 		} else {
 			app.ErrorLog.Print(err)
 			// TODO: show error in UI
diff --git a/pkg/scraper/api_types_v2.go b/pkg/scraper/api_types_v2.go
index 45c9b41..47b2af5 100644
--- a/pkg/scraper/api_types_v2.go
+++ b/pkg/scraper/api_types_v2.go
@@ -895,7 +895,7 @@ func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) e
 	for last_response.GetCursorBottom() != "" && len(response.GetMainInstruction().Entries) < count {
 		fresh_response, err := pq.NextPage(api, last_response.GetCursorBottom())
 		if err != nil {
-			return fmt.Errorf("error getting next page for %#v: %w", pq, err)
+			return fmt.Errorf("error getting next page for %#v: %w", pq, err) // e.g., rate limited
 		}
 
 		if fresh_response.GetCursorBottom() == last_response.GetCursorBottom() && len(fresh_response.GetMainInstruction().Entries) == 0 {
@@ -925,25 +925,31 @@ func (api *API) GetPaginatedQuery(pq PaginatedQuery, count int) (TweetTrove, err
 	fmt.Printf("Paginating %d count\n", count)
 	api_response, err := pq.NextPage(api, "")
 	if err != nil {
+		// End of feed on the first call constitutes an empty result, so returning empty is OK
 		return TweetTrove{}, fmt.Errorf("Error calling API to fetch query %#v:\n %w", pq, err)
 	}
 
 	if len(api_response.GetMainInstruction().Entries) < count && api_response.GetCursorBottom() != "" {
 		err = api.GetMore(pq, &api_response, count)
 		if errors.Is(err, END_OF_FEED) {
-			println("End of feed!")
+			log.Infof("End of feed!")
+		} else if errors.Is(err, ErrRateLimited) {
+			log.Errorf("Rate limited!")
 		} else if err != nil {
 			return TweetTrove{}, err
 		}
 	}
 
-	trove, err := pq.ToTweetTrove(api_response)
-	if err != nil {
-		return TweetTrove{}, fmt.Errorf("Error parsing the tweet trove for query %#v:\n %w", pq, err)
+	trove, err2 := pq.ToTweetTrove(api_response)
+	if err2 != nil {
+		return TweetTrove{}, fmt.Errorf("Error parsing the tweet trove for query %#v:\n %w", pq, err2)
 	}
 
 	fmt.Println("------------")
-	err = trove.PostProcess()
-	return trove, err
+	err2 = trove.PostProcess()
+	if err2 != nil {
+		return TweetTrove{}, fmt.Errorf("failed to post-process tweet trove: %w", err2)
+	}
+	return trove, err // `err` will be either nil, END_OF_FEED, or ErrRateLimited
 }
 
 // Get a User feed using the new GraphQL twitter api
@@ -987,7 +993,6 @@ func (api *API) GetGraphqlFeedFor(user_id UserID, cursor string) (APIV2Response,
 
 	var response APIV2Response
 	err = api.do_http(url.String(), cursor, &response)
-
 	return response, err
 }
 
@@ -1107,10 +1112,7 @@ func (api *API) GetUserLikes(user_id UserID, cursor string) (APIV2Response, erro
 
 	var response APIV2Response
 	err = api.do_http(url.String(), cursor, &response)
-	if err != nil {
-		panic(err)
-	}
-	return response, nil
+	return response, err
 }
 
 type PaginatedUserLikes struct {
@@ -1176,10 +1178,7 @@ func (api *API) GetBookmarks(cursor string) (APIV2Response, error) {
 
 	var response APIV2Response
 	err = api.do_http(url.String(), cursor, &response)
-	if err != nil {
-		panic(err)
-	}
-	return response, nil
+	return response, err
 }
 
 type PaginatedBookmarks struct {
@@ -1259,12 +1258,9 @@ func (api *API) GetHomeTimeline(cursor string, is_following_only bool) (TweetTro
 		panic(err)
 	}
 	err = api.do_http_POST(url, string(body_bytes), &response)
-	if err != nil {
-		panic(err)
-	}
-	trove, err := response.ToTweetTrove()
-	if err != nil {
-		return TweetTrove{}, err
+	trove, err2 := response.ToTweetTrove()
+	if err2 != nil {
+		return TweetTrove{}, err2
 	}
 	return trove, err
 }
@@ -1312,11 +1308,7 @@ func (api API) GetUser(handle UserHandle) (APIUser, error) {
 
 	var response UserResponse
 	err = api.do_http(url.String(), "", &response)
-	if err != nil {
-		panic(err)
-	}
-
-	return response.ConvertToAPIUser(), nil
+	return response.ConvertToAPIUser(), err
 }
 
 func (api *API) Search(query string, cursor string) (APIV2Response, error) {
@@ -1373,3 +1365,13 @@ func (p PaginatedSearch) NextPage(api *API, cursor string) (APIV2Response, error
 func (p PaginatedSearch) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
 	return r.ToTweetTrove()
 }
+
+// TODO: Search modes:
+// - regular ("top")
+// - latest / "live"
+// - search for users
+// - photos
+// - videos
+func Search(query string, min_results int) (trove TweetTrove, err error) {
+	return the_api.GetPaginatedQuery(PaginatedSearch{query}, min_results)
+}
diff --git a/pkg/scraper/search.go b/pkg/scraper/search.go
deleted file mode 100644
index 901472a..0000000
--- a/pkg/scraper/search.go
+++ /dev/null
@@ -1,17 +0,0 @@
-package scraper
-
-func TimestampToDateString(timestamp int) string {
-	panic("???") // TODO
-}
-
-/**
- * TODO: Search modes:
- * - regular ("top")
- * - latest / "live"
- * - search for users
- * - photos
- * - videos
- */
-func Search(query string, min_results int) (trove TweetTrove, err error) {
-	return the_api.GetPaginatedQuery(PaginatedSearch{query}, min_results)
-}
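
Example of the calling convention this patch establishes (a sketch for
illustration only, not part of the patch): scraper functions now return
whatever partial TweetTrove they managed to collect, together with an error
that may be nil, END_OF_FEED, or ErrRateLimited, and callers use
is_scrape_failure to distinguish real failures from "stop early, but keep the
results" conditions. The function fetch_example below is hypothetical;
is_scrape_failure, scraper.GetUserLikes, profile.SaveTweetTrove, and die are
the identifiers shown in the diff above.

	// Hypothetical caller, demonstrating the contract introduced by this patch.
	func fetch_example(user_id scraper.UserID, how_many int) {
		// GetUserLikes may return a partial trove alongside ErrRateLimited.
		trove, err := scraper.GetUserLikes(user_id, how_many)
		if is_scrape_failure(err) {
			// A real failure: nothing usable came back.
			die(fmt.Sprintf("Error scraping likes:\n %s", err.Error()), false, -2)
		}
		// err is nil, END_OF_FEED, or ErrRateLimited here: save whatever was
		// collected instead of throwing the partial results away.
		profile.SaveTweetTrove(trove, true)
	}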