From 92b166a4ebfc480356ec6a80ebd965385b32a1cb Mon Sep 17 00:00:00 2001 From: Alessio Date: Tue, 26 Dec 2023 15:54:41 -0600 Subject: [PATCH] REFACTOR: create 'GetPaginatedQuery' function to encapsulate queries with cursors --- pkg/scraper/api_errors.go | 10 ++--- pkg/scraper/api_types_v2.go | 78 +++++++++++++++++++++++++------------ pkg/scraper/search.go | 30 +------------- pkg/scraper/tweet.go | 39 ++++--------------- pkg/scraper/user_feed.go | 22 +---------- 5 files changed, 69 insertions(+), 110 deletions(-) diff --git a/pkg/scraper/api_errors.go b/pkg/scraper/api_errors.go index e6692e5..24876a9 100644 --- a/pkg/scraper/api_errors.go +++ b/pkg/scraper/api_errors.go @@ -1,12 +1,12 @@ package scraper import ( - "fmt" + "errors" ) var ( - END_OF_FEED = fmt.Errorf("End of feed") - DOESNT_EXIST = fmt.Errorf("Doesn't exist") - EXTERNAL_API_ERROR = fmt.Errorf("Unexpected result from external API") - API_PARSE_ERROR = fmt.Errorf("Couldn't parse the result returned from the API") + END_OF_FEED = errors.New("End of feed") + ErrDoesntExist = errors.New("Doesn't exist") + EXTERNAL_API_ERROR = errors.New("Unexpected result from external API") + ErrorIsTombstone = errors.New("tweet is a tombstone") ) diff --git a/pkg/scraper/api_types_v2.go b/pkg/scraper/api_types_v2.go index 94b5d95..4c82559 100644 --- a/pkg/scraper/api_types_v2.go +++ b/pkg/scraper/api_types_v2.go @@ -11,9 +11,6 @@ import ( log "github.com/sirupsen/logrus" ) -var ErrorIsTombstone = errors.New("tweet is a tombstone") -var ErrTweetNotFound = errors.New("api responded 'no status found with that ID'") - type CardValue struct { Type string `json:"type"` StringValue string `json:"string_value"` @@ -808,6 +805,7 @@ func (r APIV2Response) ToTweetTroveAsLikes() (TweetTrove, error) { type PaginatedQuery interface { NextPage(api *API, cursor string) (APIV2Response, error) + ToTweetTrove(r APIV2Response) (TweetTrove, error) } func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) error { @@ -820,10 +818,12 @@ func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) e if fresh_response.GetCursorBottom() == last_response.GetCursorBottom() && len(fresh_response.GetMainInstruction().Entries) == 0 { // Empty response, cursor same as previous: end of feed has been reached + fmt.Printf("Cursor repeated; EOF\n") return END_OF_FEED } if fresh_response.IsEmpty() { // Response has a pinned tweet, but no other content: end of feed has been reached + fmt.Printf("No non-pinned-tweet entries; EOF\n") return END_OF_FEED // TODO: check that there actually is a pinned tweet and the request didn't just fail lol } @@ -839,6 +839,31 @@ func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) e return nil } +func (api *API) GetPaginatedQuery(pq PaginatedQuery, count int) (TweetTrove, error) { + fmt.Printf("Paginating %d count\n", count) + api_response, err := pq.NextPage(api, "") + if err != nil { + return TweetTrove{}, fmt.Errorf("Error calling API to fetch query %#v:\n %w", pq, err) + } + if len(api_response.GetMainInstruction().Entries) < count && api_response.GetCursorBottom() != "" { + err = api.GetMore(pq, &api_response, count) + if errors.Is(err, END_OF_FEED) { + println("End of feed!") + } else if err != nil { + return TweetTrove{}, err + } + } + + trove, err := pq.ToTweetTrove(api_response) + if err != nil { + return TweetTrove{}, fmt.Errorf("Error parsing the tweet trove for query %#v:\n %w", pq, err) + } + + fmt.Println("------------") + err = trove.PostProcess() + return trove, err +} + 
// Get a User feed using the new GraphQL twitter api func (api *API) GetGraphqlFeedFor(user_id UserID, cursor string) (APIV2Response, error) { url, err := url.Parse(GraphqlURL{ @@ -877,6 +902,9 @@ type PaginatedUserFeed struct { func (p PaginatedUserFeed) NextPage(api *API, cursor string) (APIV2Response, error) { return api.GetGraphqlFeedFor(p.user_id, cursor) } +func (p PaginatedUserFeed) ToTweetTrove(r APIV2Response) (TweetTrove, error) { + return r.ToTweetTrove() +} func (api *API) GetTweetDetail(tweet_id TweetID, cursor string) (APIV2Response, error) { url, err := url.Parse(GraphqlURL{ @@ -919,6 +947,12 @@ func (api *API) GetTweetDetail(tweet_id TweetID, cursor string) (APIV2Response, var response APIV2Response err = api.do_http(url.String(), cursor, &response) + if len(response.Errors) != 0 { + if response.Errors[0].Message == "_Missing: No status found with that ID." { + return response, ErrDoesntExist + } + return response, fmt.Errorf("%w: %s", EXTERNAL_API_ERROR, response.Errors[0].Message) + } return response, err } @@ -930,6 +964,9 @@ type PaginatedTweetReplies struct { func (p PaginatedTweetReplies) NextPage(api *API, cursor string) (APIV2Response, error) { return api.GetTweetDetail(p.tweet_id, cursor) } +func (p PaginatedTweetReplies) ToTweetTrove(r APIV2Response) (TweetTrove, error) { + return r.ToTweetTrove() +} func (api *API) GetUserLikes(user_id UserID, cursor string) (APIV2Response, error) { url, err := url.Parse(GraphqlURL{ @@ -984,33 +1021,23 @@ type PaginatedUserLikes struct { func (p PaginatedUserLikes) NextPage(api *API, cursor string) (APIV2Response, error) { return api.GetUserLikes(p.user_id, cursor) } - -func GetUserLikes(user_id UserID, how_many int) (TweetTrove, error) { - response, err := the_api.GetUserLikes(user_id, "") - if err != nil { - return TweetTrove{}, err - } - - if len(response.GetMainInstruction().Entries) < how_many && response.GetCursorBottom() != "" { - err = the_api.GetMore(PaginatedUserLikes{user_id}, &response, how_many) - if errors.Is(err, END_OF_FEED) { - println("End of feed!") - } else if err != nil { - return TweetTrove{}, err - } - } - trove, err := response.ToTweetTroveAsLikes() +func (p PaginatedUserLikes) ToTweetTrove(r APIV2Response) (TweetTrove, error) { + ret, err := r.ToTweetTroveAsLikes() if err != nil { return TweetTrove{}, err } // Fill out the liking UserID - for i := range trove.Likes { - l := trove.Likes[i] - l.UserID = user_id - trove.Likes[i] = l + for i := range ret.Likes { + l := ret.Likes[i] + l.UserID = p.user_id + ret.Likes[i] = l } - return trove, nil + return ret, nil +} + +func GetUserLikes(user_id UserID, how_many int) (TweetTrove, error) { + return the_api.GetPaginatedQuery(PaginatedUserLikes{user_id}, how_many) } func (api *API) GetHomeTimeline(cursor string, is_for_you bool) (TweetTrove, error) { @@ -1175,3 +1202,6 @@ type PaginatedSearch struct { func (p PaginatedSearch) NextPage(api *API, cursor string) (APIV2Response, error) { return api.Search(p.query, cursor) } +func (p PaginatedSearch) ToTweetTrove(r APIV2Response) (TweetTrove, error) { + return r.ToTweetTrove() +} diff --git a/pkg/scraper/search.go b/pkg/scraper/search.go index 5f6b901..901472a 100644 --- a/pkg/scraper/search.go +++ b/pkg/scraper/search.go @@ -1,10 +1,5 @@ package scraper -import ( - "errors" - "fmt" -) - func TimestampToDateString(timestamp int) string { panic("???") // TODO } @@ -18,28 +13,5 @@ func TimestampToDateString(timestamp int) string { * - videos */ func Search(query string, min_results int) (trove TweetTrove, err error) { - 
api_response, err := the_api.Search(query, "") - if err != nil { - return - } - - if len(api_response.GetMainInstruction().Entries) < min_results && api_response.GetCursorBottom() != "" { - err = the_api.GetMore(PaginatedSearch{query}, &api_response, min_results) - if errors.Is(err, END_OF_FEED) { - println("End of feed!") - } else if err != nil { - return - } - } - - trove, err = api_response.ToTweetTrove() - if err != nil { - err = fmt.Errorf("Error parsing the tweet trove for search query %q:\n %w", query, err) - return - } - - // Filling tombstones and tombstoned users is probably not necessary here, but we still - // need to fetch Spaces - err = trove.PostProcess() - return + return the_api.GetPaginatedQuery(PaginatedSearch{query}, min_results) } diff --git a/pkg/scraper/tweet.go b/pkg/scraper/tweet.go index 74ddbd2..7e18f2f 100644 --- a/pkg/scraper/tweet.go +++ b/pkg/scraper/tweet.go @@ -327,39 +327,16 @@ func GetTweetFull(id TweetID, how_many int) (trove TweetTrove, err error) { return } -func GetTweetFullAPIV2(id TweetID, how_many int) (trove TweetTrove, err error) { - resp, err := the_api.GetTweetDetail(id, "") - if err != nil { - err = fmt.Errorf("Error getting tweet detail: %d\n %w", id, err) - return - } - if len(resp.Errors) != 0 { - if resp.Errors[0].Message == "_Missing: No status found with that ID." { - trove := NewTweetTrove() - trove.Tweets[id] = Tweet{ID: id, TombstoneType: "deleted", IsConversationScraped: true} - return trove, nil - } - panic(fmt.Sprintf("Unknown error: %s", resp.Errors[0].Message)) - } - - err = the_api.GetMore(PaginatedTweetReplies{id}, &resp, how_many) - if err != nil && !errors.Is(err, END_OF_FEED) { - err = fmt.Errorf("Error getting more replies in tweet detail: %d\n %w", id, err) - return - } - trove, err = resp.ToTweetTrove() - if err != nil { +func GetTweetFullAPIV2(id TweetID, how_many int) (TweetTrove, error) { + trove, err := the_api.GetPaginatedQuery(PaginatedTweetReplies{id}, how_many) + if errors.Is(err, ErrDoesntExist) { + trove := NewTweetTrove() + trove.Tweets[id] = Tweet{ID: id, TombstoneType: "deleted", IsConversationScraped: true} + return trove, nil + } else if err != nil { return trove, err } - // Quoted tombstones need their user_id filled out from the tombstoned_users list - log.Debug("Running tweet trove post-processing\n") - err = trove.PostProcess() - if err != nil { - err = fmt.Errorf("Error getting tweet (id %d):\n %w", id, err) - return - } - // Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at" tweet, ok := trove.Tweets[id] if !ok { @@ -369,5 +346,5 @@ func GetTweetFullAPIV2(id TweetID, how_many int) (trove TweetTrove, err error) { tweet.IsConversationScraped = true trove.Tweets[id] = tweet - return + return trove, err } diff --git a/pkg/scraper/user_feed.go b/pkg/scraper/user_feed.go index 6ed6db6..30bc539 100644 --- a/pkg/scraper/user_feed.go +++ b/pkg/scraper/user_feed.go @@ -33,25 +33,5 @@ func GetUserFeedFor(user_id UserID, min_tweets int) (trove TweetTrove, err error } func GetUserFeedGraphqlFor(user_id UserID, min_tweets int) (trove TweetTrove, err error) { - api_response, err := the_api.GetGraphqlFeedFor(user_id, "") - if err != nil { - err = fmt.Errorf("Error calling API to fetch user feed: UserID %d\n %w", user_id, err) - return - } - - if len(api_response.GetMainInstruction().Entries) < min_tweets && api_response.GetCursorBottom() != "" { - err = the_api.GetMore(PaginatedUserFeed{user_id}, &api_response, min_tweets) - if err != nil && !errors.Is(err, END_OF_FEED) { - return - 
} - } - - trove, err = api_response.ToTweetTrove() - if err != nil { - panic(err) - } - - fmt.Println("------------") - err = trove.PostProcess() - return trove, err + return the_api.GetPaginatedQuery(PaginatedUserFeed{user_id}, min_tweets) }
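
A note on extending this refactor: any cursor-driven GraphQL endpoint can now reuse GetPaginatedQuery by implementing the two-method PaginatedQuery interface. The sketch below is illustrative only and is not part of the diff; PaginatedBookmarks, GetBookmarks, and the api.GetBookmarks fetcher are hypothetical stand-ins, where a real fetcher would be an API method like GetGraphqlFeedFor or GetUserLikes.

// Hypothetical example (not in this patch): pagination support for a new endpoint.
type PaginatedBookmarks struct {
	user_id UserID // whatever per-query state NextPage needs
}

// NextPage hands the cursor to an endpoint-specific fetcher; GetBookmarks is a
// placeholder here, standing in for a method like GetGraphqlFeedFor.
func (p PaginatedBookmarks) NextPage(api *API, cursor string) (APIV2Response, error) {
	return api.GetBookmarks(p.user_id, cursor)
}

// ToTweetTrove converts one page of results; most implementations simply
// delegate, as PaginatedUserFeed, PaginatedTweetReplies and PaginatedSearch do.
func (p PaginatedBookmarks) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
	return r.ToTweetTrove()
}

// The public wrapper then collapses to a single call, mirroring Search(),
// GetUserLikes() and GetUserFeedGraphqlFor() in this patch.
func GetBookmarks(user_id UserID, how_many int) (TweetTrove, error) {
	return the_api.GetPaginatedQuery(PaginatedBookmarks{user_id}, how_many)
}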
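
A note on error handling at call sites: GetTweetDetail now maps the API's "_Missing: No status found with that ID." response to the ErrDoesntExist sentinel, and GetPaginatedQuery wraps lower-level failures with %w, so errors.Is still matches through the wrapping (this is what GetTweetFullAPIV2 relies on). The sketch below is illustrative and not part of the diff; the tweetStillExists helper and the reply count of 50 are invented for the example.

// Hypothetical call site (not in this patch) showing sentinel-error matching.
func tweetStillExists(tweet_id TweetID) (bool, error) {
	_, err := the_api.GetPaginatedQuery(PaginatedTweetReplies{tweet_id}, 50)
	if errors.Is(err, ErrDoesntExist) {
		// GetTweetDetail returned the sentinel for a deleted/missing tweet;
		// the %w wrapping inside GetPaginatedQuery preserves it.
		return false, nil
	}
	// END_OF_FEED is consumed inside GetPaginatedQuery (it just stops
	// paginating), so any error reaching this point is a real failure, e.g.
	// EXTERNAL_API_ERROR carrying the raw message from the API response.
	return err == nil, err
}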