REFACTOR: create 'GetPaginatedQuery' function to encapsulate queries with cursors

This commit is contained in:
Alessio 2023-12-26 15:54:41 -06:00
parent 4b6686cde6
commit 92b166a4eb
5 changed files with 69 additions and 110 deletions

View File

@ -1,12 +1,12 @@
package scraper package scraper
import ( import (
"fmt" "errors"
) )
var ( var (
END_OF_FEED = fmt.Errorf("End of feed") END_OF_FEED = errors.New("End of feed")
DOESNT_EXIST = fmt.Errorf("Doesn't exist") ErrDoesntExist = errors.New("Doesn't exist")
EXTERNAL_API_ERROR = fmt.Errorf("Unexpected result from external API") EXTERNAL_API_ERROR = errors.New("Unexpected result from external API")
API_PARSE_ERROR = fmt.Errorf("Couldn't parse the result returned from the API") ErrorIsTombstone = errors.New("tweet is a tombstone")
) )

View File

@ -11,9 +11,6 @@ import (
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
) )
var ErrorIsTombstone = errors.New("tweet is a tombstone")
var ErrTweetNotFound = errors.New("api responded 'no status found with that ID'")
type CardValue struct { type CardValue struct {
Type string `json:"type"` Type string `json:"type"`
StringValue string `json:"string_value"` StringValue string `json:"string_value"`
@ -808,6 +805,7 @@ func (r APIV2Response) ToTweetTroveAsLikes() (TweetTrove, error) {
type PaginatedQuery interface { type PaginatedQuery interface {
NextPage(api *API, cursor string) (APIV2Response, error) NextPage(api *API, cursor string) (APIV2Response, error)
ToTweetTrove(r APIV2Response) (TweetTrove, error)
} }
func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) error { func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) error {
@ -820,10 +818,12 @@ func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) e
if fresh_response.GetCursorBottom() == last_response.GetCursorBottom() && len(fresh_response.GetMainInstruction().Entries) == 0 { if fresh_response.GetCursorBottom() == last_response.GetCursorBottom() && len(fresh_response.GetMainInstruction().Entries) == 0 {
// Empty response, cursor same as previous: end of feed has been reached // Empty response, cursor same as previous: end of feed has been reached
fmt.Printf("Cursor repeated; EOF\n")
return END_OF_FEED return END_OF_FEED
} }
if fresh_response.IsEmpty() { if fresh_response.IsEmpty() {
// Response has a pinned tweet, but no other content: end of feed has been reached // Response has a pinned tweet, but no other content: end of feed has been reached
fmt.Printf("No non-pinned-tweet entries; EOF\n")
return END_OF_FEED // TODO: check that there actually is a pinned tweet and the request didn't just fail lol return END_OF_FEED // TODO: check that there actually is a pinned tweet and the request didn't just fail lol
} }
@ -839,6 +839,31 @@ func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) e
return nil return nil
} }
// GetPaginatedQuery runs a paginated query from its first page and returns the
// resulting TweetTrove.
//
// The first page is requested with an empty cursor. If that page holds fewer than
// `count` entries and a bottom cursor is present, GetMore is called to keep paging.
// An END_OF_FEED error from GetMore is not treated as a failure; any other error
// aborts the query. The accumulated response is then converted with the query's own
// ToTweetTrove and the trove is post-processed before being returned.
func (api *API) GetPaginatedQuery(pq PaginatedQuery, count int) (TweetTrove, error) {
	fmt.Printf("Paginating %d count\n", count)

	// An empty cursor asks for the first page
	response, err := pq.NextPage(api, "")
	if err != nil {
		return TweetTrove{}, fmt.Errorf("Error calling API to fetch query %#v:\n %w", pq, err)
	}

	// Only keep paging if the first page fell short and there is a cursor to follow
	needs_more := len(response.GetMainInstruction().Entries) < count && response.GetCursorBottom() != ""
	if needs_more {
		err = api.GetMore(pq, &response, count)
		switch {
		case errors.Is(err, END_OF_FEED):
			// Running out of feed is expected; carry on with what was fetched
			println("End of feed!")
		case err != nil:
			return TweetTrove{}, err
		}
	}

	result, err := pq.ToTweetTrove(response)
	if err != nil {
		return TweetTrove{}, fmt.Errorf("Error parsing the tweet trove for query %#v:\n %w", pq, err)
	}

	fmt.Println("------------")
	// Post-process first, then return: the trove is returned even if post-processing fails
	err = result.PostProcess()
	return result, err
}
// Get a User feed using the new GraphQL twitter api // Get a User feed using the new GraphQL twitter api
func (api *API) GetGraphqlFeedFor(user_id UserID, cursor string) (APIV2Response, error) { func (api *API) GetGraphqlFeedFor(user_id UserID, cursor string) (APIV2Response, error) {
url, err := url.Parse(GraphqlURL{ url, err := url.Parse(GraphqlURL{
@ -877,6 +902,9 @@ type PaginatedUserFeed struct {
func (p PaginatedUserFeed) NextPage(api *API, cursor string) (APIV2Response, error) { func (p PaginatedUserFeed) NextPage(api *API, cursor string) (APIV2Response, error) {
return api.GetGraphqlFeedFor(p.user_id, cursor) return api.GetGraphqlFeedFor(p.user_id, cursor)
} }
// ToTweetTrove converts the given API response into a TweetTrove by delegating to
// the response's own ToTweetTrove method.
func (p PaginatedUserFeed) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
	trove, err := r.ToTweetTrove()
	return trove, err
}
func (api *API) GetTweetDetail(tweet_id TweetID, cursor string) (APIV2Response, error) { func (api *API) GetTweetDetail(tweet_id TweetID, cursor string) (APIV2Response, error) {
url, err := url.Parse(GraphqlURL{ url, err := url.Parse(GraphqlURL{
@ -919,6 +947,12 @@ func (api *API) GetTweetDetail(tweet_id TweetID, cursor string) (APIV2Response,
var response APIV2Response var response APIV2Response
err = api.do_http(url.String(), cursor, &response) err = api.do_http(url.String(), cursor, &response)
if len(response.Errors) != 0 {
if response.Errors[0].Message == "_Missing: No status found with that ID." {
return response, ErrDoesntExist
}
return response, fmt.Errorf("%w: %s", EXTERNAL_API_ERROR, response.Errors[0].Message)
}
return response, err return response, err
} }
@ -930,6 +964,9 @@ type PaginatedTweetReplies struct {
func (p PaginatedTweetReplies) NextPage(api *API, cursor string) (APIV2Response, error) { func (p PaginatedTweetReplies) NextPage(api *API, cursor string) (APIV2Response, error) {
return api.GetTweetDetail(p.tweet_id, cursor) return api.GetTweetDetail(p.tweet_id, cursor)
} }
// ToTweetTrove converts the given API response into a TweetTrove by delegating to
// the response's own ToTweetTrove method.
func (p PaginatedTweetReplies) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
	trove, err := r.ToTweetTrove()
	return trove, err
}
func (api *API) GetUserLikes(user_id UserID, cursor string) (APIV2Response, error) { func (api *API) GetUserLikes(user_id UserID, cursor string) (APIV2Response, error) {
url, err := url.Parse(GraphqlURL{ url, err := url.Parse(GraphqlURL{
@ -984,33 +1021,23 @@ type PaginatedUserLikes struct {
func (p PaginatedUserLikes) NextPage(api *API, cursor string) (APIV2Response, error) { func (p PaginatedUserLikes) NextPage(api *API, cursor string) (APIV2Response, error) {
return api.GetUserLikes(p.user_id, cursor) return api.GetUserLikes(p.user_id, cursor)
} }
func (p PaginatedUserLikes) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
func GetUserLikes(user_id UserID, how_many int) (TweetTrove, error) { ret, err := r.ToTweetTroveAsLikes()
response, err := the_api.GetUserLikes(user_id, "")
if err != nil {
return TweetTrove{}, err
}
if len(response.GetMainInstruction().Entries) < how_many && response.GetCursorBottom() != "" {
err = the_api.GetMore(PaginatedUserLikes{user_id}, &response, how_many)
if errors.Is(err, END_OF_FEED) {
println("End of feed!")
} else if err != nil {
return TweetTrove{}, err
}
}
trove, err := response.ToTweetTroveAsLikes()
if err != nil { if err != nil {
return TweetTrove{}, err return TweetTrove{}, err
} }
// Fill out the liking UserID // Fill out the liking UserID
for i := range trove.Likes { for i := range ret.Likes {
l := trove.Likes[i] l := ret.Likes[i]
l.UserID = user_id l.UserID = p.user_id
trove.Likes[i] = l ret.Likes[i] = l
} }
return trove, nil return ret, nil
}
func GetUserLikes(user_id UserID, how_many int) (TweetTrove, error) {
return the_api.GetPaginatedQuery(PaginatedUserLikes{user_id}, how_many)
} }
func (api *API) GetHomeTimeline(cursor string, is_for_you bool) (TweetTrove, error) { func (api *API) GetHomeTimeline(cursor string, is_for_you bool) (TweetTrove, error) {
@ -1175,3 +1202,6 @@ type PaginatedSearch struct {
func (p PaginatedSearch) NextPage(api *API, cursor string) (APIV2Response, error) { func (p PaginatedSearch) NextPage(api *API, cursor string) (APIV2Response, error) {
return api.Search(p.query, cursor) return api.Search(p.query, cursor)
} }
// ToTweetTrove converts the given API response into a TweetTrove by delegating to
// the response's own ToTweetTrove method.
func (p PaginatedSearch) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
	trove, err := r.ToTweetTrove()
	return trove, err
}

View File

@ -1,10 +1,5 @@
package scraper package scraper
import (
"errors"
"fmt"
)
func TimestampToDateString(timestamp int) string { func TimestampToDateString(timestamp int) string {
panic("???") // TODO panic("???") // TODO
} }
@ -18,28 +13,5 @@ func TimestampToDateString(timestamp int) string {
* - videos * - videos
*/ */
func Search(query string, min_results int) (trove TweetTrove, err error) { func Search(query string, min_results int) (trove TweetTrove, err error) {
api_response, err := the_api.Search(query, "") return the_api.GetPaginatedQuery(PaginatedSearch{query}, min_results)
if err != nil {
return
}
if len(api_response.GetMainInstruction().Entries) < min_results && api_response.GetCursorBottom() != "" {
err = the_api.GetMore(PaginatedSearch{query}, &api_response, min_results)
if errors.Is(err, END_OF_FEED) {
println("End of feed!")
} else if err != nil {
return
}
}
trove, err = api_response.ToTweetTrove()
if err != nil {
err = fmt.Errorf("Error parsing the tweet trove for search query %q:\n %w", query, err)
return
}
// Filling tombstones and tombstoned users is probably not necessary here, but we still
// need to fetch Spaces
err = trove.PostProcess()
return
} }

View File

@ -327,39 +327,16 @@ func GetTweetFull(id TweetID, how_many int) (trove TweetTrove, err error) {
return return
} }
func GetTweetFullAPIV2(id TweetID, how_many int) (trove TweetTrove, err error) { func GetTweetFullAPIV2(id TweetID, how_many int) (TweetTrove, error) {
resp, err := the_api.GetTweetDetail(id, "") trove, err := the_api.GetPaginatedQuery(PaginatedTweetReplies{id}, how_many)
if err != nil { if errors.Is(err, ErrDoesntExist) {
err = fmt.Errorf("Error getting tweet detail: %d\n %w", id, err) trove := NewTweetTrove()
return trove.Tweets[id] = Tweet{ID: id, TombstoneType: "deleted", IsConversationScraped: true}
} return trove, nil
if len(resp.Errors) != 0 { } else if err != nil {
if resp.Errors[0].Message == "_Missing: No status found with that ID." {
trove := NewTweetTrove()
trove.Tweets[id] = Tweet{ID: id, TombstoneType: "deleted", IsConversationScraped: true}
return trove, nil
}
panic(fmt.Sprintf("Unknown error: %s", resp.Errors[0].Message))
}
err = the_api.GetMore(PaginatedTweetReplies{id}, &resp, how_many)
if err != nil && !errors.Is(err, END_OF_FEED) {
err = fmt.Errorf("Error getting more replies in tweet detail: %d\n %w", id, err)
return
}
trove, err = resp.ToTweetTrove()
if err != nil {
return trove, err return trove, err
} }
// Quoted tombstones need their user_id filled out from the tombstoned_users list
log.Debug("Running tweet trove post-processing\n")
err = trove.PostProcess()
if err != nil {
err = fmt.Errorf("Error getting tweet (id %d):\n %w", id, err)
return
}
// Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at" // Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at"
tweet, ok := trove.Tweets[id] tweet, ok := trove.Tweets[id]
if !ok { if !ok {
@ -369,5 +346,5 @@ func GetTweetFullAPIV2(id TweetID, how_many int) (trove TweetTrove, err error) {
tweet.IsConversationScraped = true tweet.IsConversationScraped = true
trove.Tweets[id] = tweet trove.Tweets[id] = tweet
return return trove, err
} }

View File

@ -33,25 +33,5 @@ func GetUserFeedFor(user_id UserID, min_tweets int) (trove TweetTrove, err error
} }
func GetUserFeedGraphqlFor(user_id UserID, min_tweets int) (trove TweetTrove, err error) { func GetUserFeedGraphqlFor(user_id UserID, min_tweets int) (trove TweetTrove, err error) {
api_response, err := the_api.GetGraphqlFeedFor(user_id, "") return the_api.GetPaginatedQuery(PaginatedUserFeed{user_id}, min_tweets)
if err != nil {
err = fmt.Errorf("Error calling API to fetch user feed: UserID %d\n %w", user_id, err)
return
}
if len(api_response.GetMainInstruction().Entries) < min_tweets && api_response.GetCursorBottom() != "" {
err = the_api.GetMore(PaginatedUserFeed{user_id}, &api_response, min_tweets)
if err != nil && !errors.Is(err, END_OF_FEED) {
return
}
}
trove, err = api_response.ToTweetTrove()
if err != nil {
panic(err)
}
fmt.Println("------------")
err = trove.PostProcess()
return trove, err
} }