Don't throw away all previous results if you get rate-limited, just save them

This commit is contained in:
Alessio 2024-06-10 19:59:08 -07:00
parent b23a6a7e05
commit a1faacaf6b
5 changed files with 52 additions and 58 deletions

View File

@ -2,6 +2,7 @@ package main
import (
_ "embed"
"errors"
"fmt"
"os"
"path/filepath"
@ -82,3 +83,11 @@ func get_default_profile() string {
}
return filepath.Join(app_data_dir, "twitter")
}
// Returns whether this error should be treated as a failure
func is_scrape_failure(err error) bool {
	if err == nil {
		return false
	}
	// Hitting the end of the feed or getting rate-limited are expected
	// interruptions, not failures: partial results should still be saved.
	return !errors.Is(err, scraper.END_OF_FEED) && !errors.Is(err, scraper.ErrRateLimited)
}

View File

@ -261,7 +261,7 @@ func create_profile(target_dir string) {
*/
func fetch_user(handle scraper.UserHandle) {
user, err := scraper.GetUser(handle)
if err != nil {
if is_scrape_failure(err) {
die(err.Error(), false, -1)
}
log.Debug(user)
@ -288,7 +288,7 @@ func fetch_tweet_only(tweet_identifier string) {
}
tweet, err := scraper.GetTweet(tweet_id)
if err != nil {
if is_scrape_failure(err) {
die(fmt.Sprintf("Error fetching tweet: %s", err.Error()), false, -1)
}
log.Debug(tweet)
@ -313,7 +313,7 @@ func fetch_tweet_conversation(tweet_identifier string, how_many int) {
}
trove, err := scraper.GetTweetFullAPIV2(tweet_id, how_many)
if err != nil {
if is_scrape_failure(err) {
die(err.Error(), false, -1)
}
profile.SaveTweetTrove(trove, true)
@ -329,12 +329,12 @@ func fetch_tweet_conversation(tweet_identifier string, how_many int) {
*/
func fetch_user_feed(handle string, how_many int) {
user, err := profile.GetUserByHandle(scraper.UserHandle(handle))
if err != nil {
if is_scrape_failure(err) {
die(fmt.Sprintf("Error getting user: %s\n %s", handle, err.Error()), false, -1)
}
trove, err := scraper.GetUserFeedGraphqlFor(user.ID, how_many)
if err != nil {
if is_scrape_failure(err) {
die(fmt.Sprintf("Error scraping feed: %s\n %s", handle, err.Error()), false, -2)
}
profile.SaveTweetTrove(trove, true)
@ -349,7 +349,7 @@ func get_user_likes(handle string, how_many int) {
}
trove, err := scraper.GetUserLikes(user.ID, how_many)
if err != nil {
if is_scrape_failure(err) {
die(fmt.Sprintf("Error scraping feed: %s\n %s", handle, err.Error()), false, -2)
}
profile.SaveTweetTrove(trove, true)
@ -364,7 +364,7 @@ func get_followees(handle string, how_many int) {
}
trove, err := scraper.GetFollowees(user.ID, how_many)
if err != nil {
if is_scrape_failure(err) {
die(fmt.Sprintf("Error getting followees: %s\n %s", handle, err.Error()), false, -2)
}
profile.SaveTweetTrove(trove, true)
@ -378,7 +378,7 @@ func get_followers(handle string, how_many int) {
die(fmt.Sprintf("Error getting user: %s\n %s", handle, err.Error()), false, -1)
}
trove, err := scraper.GetFollowers(user.ID, how_many)
if err != nil {
if is_scrape_failure(err) {
die(fmt.Sprintf("Error getting followees: %s\n %s", handle, err.Error()), false, -2)
}
profile.SaveTweetTrove(trove, true)
@ -388,7 +388,7 @@ func get_followers(handle string, how_many int) {
}
func get_bookmarks(how_many int) {
trove, err := scraper.GetBookmarks(how_many)
if err != nil {
if is_scrape_failure(err) {
die(fmt.Sprintf("Error scraping bookmarks:\n %s", err.Error()), false, -2)
}
profile.SaveTweetTrove(trove, true)
@ -400,7 +400,7 @@ func get_bookmarks(how_many int) {
}
func fetch_timeline(is_following_only bool) {
trove, err := scraper.GetHomeTimeline("", is_following_only)
if err != nil {
if is_scrape_failure(err) {
die(fmt.Sprintf("Error fetching timeline:\n %s", err.Error()), false, -2)
}
profile.SaveTweetTrove(trove, true)
@ -437,7 +437,7 @@ func download_user_content(handle scraper.UserHandle) {
func search(query string, how_many int) {
trove, err := scraper.Search(query, how_many)
if err != nil {
if is_scrape_failure(err) {
die(fmt.Sprintf("Error scraping search results: %s", err.Error()), false, -100)
}
profile.SaveTweetTrove(trove, true)
@ -506,7 +506,7 @@ func fetch_inbox(how_many int) {
func fetch_dm(id string, how_many int) {
room, err := profile.GetChatRoom(scraper.DMChatRoomID(id))
if err != nil {
if is_scrape_failure(err) {
panic(err)
}
max_id := scraper.DMMessageID(^uint(0) >> 1)

View File

@ -50,10 +50,10 @@ func (app *Application) ensure_tweet(id scraper.TweetID, is_forced bool, is_conv
if is_needing_scrape && !app.IsScrapingDisabled {
trove, err := scraper.GetTweetFullAPIV2(id, 50) // TODO: parameterizable
if err == nil {
if err == nil || errors.Is(err, scraper.END_OF_FEED) || errors.Is(err, scraper.ErrRateLimited) {
app.Profile.SaveTweetTrove(trove, false)
go app.Profile.SaveTweetTrove(trove, true) // Download the content in the background
is_available = true
_, is_available = trove.Tweets[id]
} else {
app.ErrorLog.Print(err)
// TODO: show error in UI

View File

@ -895,7 +895,7 @@ func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) e
for last_response.GetCursorBottom() != "" && len(response.GetMainInstruction().Entries) < count {
fresh_response, err := pq.NextPage(api, last_response.GetCursorBottom())
if err != nil {
return fmt.Errorf("error getting next page for %#v: %w", pq, err)
return fmt.Errorf("error getting next page for %#v: %w", pq, err) // e.g., rate limited
}
if fresh_response.GetCursorBottom() == last_response.GetCursorBottom() && len(fresh_response.GetMainInstruction().Entries) == 0 {
@ -925,25 +925,31 @@ func (api *API) GetPaginatedQuery(pq PaginatedQuery, count int) (TweetTrove, err
fmt.Printf("Paginating %d count\n", count)
api_response, err := pq.NextPage(api, "")
if err != nil {
// End of feed on the first call constitutes an empty result, so returning empty is OK
return TweetTrove{}, fmt.Errorf("Error calling API to fetch query %#v:\n %w", pq, err)
}
if len(api_response.GetMainInstruction().Entries) < count && api_response.GetCursorBottom() != "" {
err = api.GetMore(pq, &api_response, count)
if errors.Is(err, END_OF_FEED) {
println("End of feed!")
log.Infof("End of feed!")
} else if errors.Is(err, ErrRateLimited) {
log.Errorf("Rate limited!")
} else if err != nil {
return TweetTrove{}, err
}
}
trove, err := pq.ToTweetTrove(api_response)
if err != nil {
return TweetTrove{}, fmt.Errorf("Error parsing the tweet trove for query %#v:\n %w", pq, err)
trove, err2 := pq.ToTweetTrove(api_response)
if err2 != nil {
return TweetTrove{}, fmt.Errorf("Error parsing the tweet trove for query %#v:\n %w", pq, err2)
}
fmt.Println("------------")
err = trove.PostProcess()
return trove, err
err2 = trove.PostProcess()
if err2 != nil {
return TweetTrove{}, fmt.Errorf("failed to post-process tweet trove: %w", err2)
}
return trove, err // `err` will be either nil, END_OF_FEED, or ErrRateLimited
}
// Get a User feed using the new GraphQL twitter api
@ -987,7 +993,6 @@ func (api *API) GetGraphqlFeedFor(user_id UserID, cursor string) (APIV2Response,
var response APIV2Response
err = api.do_http(url.String(), cursor, &response)
return response, err
}
@ -1107,10 +1112,7 @@ func (api *API) GetUserLikes(user_id UserID, cursor string) (APIV2Response, erro
var response APIV2Response
err = api.do_http(url.String(), cursor, &response)
if err != nil {
panic(err)
}
return response, nil
return response, err
}
type PaginatedUserLikes struct {
@ -1176,10 +1178,7 @@ func (api *API) GetBookmarks(cursor string) (APIV2Response, error) {
var response APIV2Response
err = api.do_http(url.String(), cursor, &response)
if err != nil {
panic(err)
}
return response, nil
return response, err
}
type PaginatedBookmarks struct {
@ -1259,12 +1258,9 @@ func (api *API) GetHomeTimeline(cursor string, is_following_only bool) (TweetTro
panic(err)
}
err = api.do_http_POST(url, string(body_bytes), &response)
if err != nil {
panic(err)
}
trove, err := response.ToTweetTrove()
if err != nil {
return TweetTrove{}, err
trove, err2 := response.ToTweetTrove()
if err2 != nil {
return TweetTrove{}, err2
}
return trove, err
}
@ -1312,11 +1308,7 @@ func (api API) GetUser(handle UserHandle) (APIUser, error) {
var response UserResponse
err = api.do_http(url.String(), "", &response)
if err != nil {
panic(err)
}
return response.ConvertToAPIUser(), nil
return response.ConvertToAPIUser(), err
}
func (api *API) Search(query string, cursor string) (APIV2Response, error) {
@ -1373,3 +1365,13 @@ func (p PaginatedSearch) NextPage(api *API, cursor string) (APIV2Response, error
func (p PaginatedSearch) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
return r.ToTweetTrove()
}
// TODO: Search modes:
// - regular ("top")
// - latest / "live"
// - search for users
// - photos
// - videos
// Search runs a paginated search for `query`, accumulating results until at
// least `min_results` have been collected (or the scrape is interrupted).
func Search(query string, min_results int) (trove TweetTrove, err error) {
	pq := PaginatedSearch{query}
	trove, err = the_api.GetPaginatedQuery(pq, min_results)
	return
}

View File

@ -1,17 +0,0 @@
package scraper
// TimestampToDateString is an unimplemented stub: calling it always panics.
// NOTE(review): this hunk (`@ -1,17 +0,0`) shows the containing file being
// deleted by this commit, so the stub is going away.
func TimestampToDateString(timestamp int) string {
	panic("???") // TODO
}
/**
* TODO: Search modes:
* - regular ("top")
* - latest / "live"
* - search for users
* - photos
* - videos
*/
// Search runs a paginated search for `query` until at least `min_results`
// results are collected. NOTE(review): this copy lives in a file the commit
// deletes; an identical definition is added to the API file elsewhere in
// this same commit.
func Search(query string, min_results int) (trove TweetTrove, err error) {
	return the_api.GetPaginatedQuery(PaginatedSearch{query}, min_results)
}