REFACTOR: create 'GetPaginatedQuery' function to encapsulate queries with cursors

This commit is contained in:
Alessio 2023-12-26 15:54:41 -06:00
parent 4b6686cde6
commit 92b166a4eb
5 changed files with 69 additions and 110 deletions

View File

@ -1,12 +1,12 @@
package scraper
import (
"fmt"
"errors"
)
var (
END_OF_FEED = fmt.Errorf("End of feed")
DOESNT_EXIST = fmt.Errorf("Doesn't exist")
EXTERNAL_API_ERROR = fmt.Errorf("Unexpected result from external API")
API_PARSE_ERROR = fmt.Errorf("Couldn't parse the result returned from the API")
END_OF_FEED = errors.New("End of feed")
ErrDoesntExist = errors.New("Doesn't exist")
EXTERNAL_API_ERROR = errors.New("Unexpected result from external API")
ErrorIsTombstone = errors.New("tweet is a tombstone")
)

View File

@ -11,9 +11,6 @@ import (
log "github.com/sirupsen/logrus"
)
var ErrorIsTombstone = errors.New("tweet is a tombstone")
var ErrTweetNotFound = errors.New("api responded 'no status found with that ID'")
type CardValue struct {
Type string `json:"type"`
StringValue string `json:"string_value"`
@ -808,6 +805,7 @@ func (r APIV2Response) ToTweetTroveAsLikes() (TweetTrove, error) {
// PaginatedQuery describes a cursor-based API query that can be fetched one page at a time
// and whose accumulated response can be converted into a TweetTrove.
type PaginatedQuery interface {
// NextPage requests a page of results using the given API client and cursor.
// GetPaginatedQuery passes an empty cursor to request the first page.
NextPage(api *API, cursor string) (APIV2Response, error)
// ToTweetTrove converts a response for this query into a TweetTrove.
ToTweetTrove(r APIV2Response) (TweetTrove, error)
}
func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) error {
@ -820,10 +818,12 @@ func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) e
if fresh_response.GetCursorBottom() == last_response.GetCursorBottom() && len(fresh_response.GetMainInstruction().Entries) == 0 {
// Empty response, cursor same as previous: end of feed has been reached
fmt.Printf("Cursor repeated; EOF\n")
return END_OF_FEED
}
if fresh_response.IsEmpty() {
// Response has a pinned tweet, but no other content: end of feed has been reached
fmt.Printf("No non-pinned-tweet entries; EOF\n")
return END_OF_FEED // TODO: check that there actually is a pinned tweet and the request didn't just fail lol
}
@ -839,6 +839,31 @@ func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) e
return nil
}
// GetPaginatedQuery runs a paginated query and returns its results as a TweetTrove.
//
// It requests the first page with an empty cursor.  If that page has fewer than `count` entries
// and carries a bottom cursor, it calls GetMore to extend the response; an END_OF_FEED result
// from GetMore is not treated as a failure.  The response is then converted with the query's
// ToTweetTrove, and PostProcess is run on the resulting trove.
//
// Errors from the first fetch and from the conversion are wrapped with the query value; any
// other GetMore error and the PostProcess error are returned as-is.
func (api *API) GetPaginatedQuery(pq PaginatedQuery, count int) (TweetTrove, error) {
	fmt.Printf("Paginating %d count\n", count)

	// An empty cursor asks for the first page
	response, err := pq.NextPage(api, "")
	if err != nil {
		return TweetTrove{}, fmt.Errorf("Error calling API to fetch query %#v:\n %w", pq, err)
	}

	if len(response.GetMainInstruction().Entries) < count && response.GetCursorBottom() != "" {
		switch more_err := api.GetMore(pq, &response, count); {
		case errors.Is(more_err, END_OF_FEED):
			// Running out of feed is fine; keep whatever was collected so far
			println("End of feed!")
		case more_err != nil:
			return TweetTrove{}, more_err
		}
	}

	result, err := pq.ToTweetTrove(response)
	if err != nil {
		return TweetTrove{}, fmt.Errorf("Error parsing the tweet trove for query %#v:\n %w", pq, err)
	}

	fmt.Println("------------")
	// Run PostProcess as its own statement so it completes before `result` is read for the return
	err = result.PostProcess()
	return result, err
}
// Get a User feed using the new GraphQL twitter api
func (api *API) GetGraphqlFeedFor(user_id UserID, cursor string) (APIV2Response, error) {
url, err := url.Parse(GraphqlURL{
@ -877,6 +902,9 @@ type PaginatedUserFeed struct {
// NextPage fetches a page of this user's feed at the given cursor by delegating to
// api.GetGraphqlFeedFor.
func (p PaginatedUserFeed) NextPage(api *API, cursor string) (APIV2Response, error) {
	return api.GetGraphqlFeedFor(p.user_id, cursor)
}
// ToTweetTrove converts a user feed response into a TweetTrove using the response's own
// ToTweetTrove method.
func (p PaginatedUserFeed) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
	return r.ToTweetTrove()
}
func (api *API) GetTweetDetail(tweet_id TweetID, cursor string) (APIV2Response, error) {
url, err := url.Parse(GraphqlURL{
@ -919,6 +947,12 @@ func (api *API) GetTweetDetail(tweet_id TweetID, cursor string) (APIV2Response,
var response APIV2Response
err = api.do_http(url.String(), cursor, &response)
if len(response.Errors) != 0 {
if response.Errors[0].Message == "_Missing: No status found with that ID." {
return response, ErrDoesntExist
}
return response, fmt.Errorf("%w: %s", EXTERNAL_API_ERROR, response.Errors[0].Message)
}
return response, err
}
@ -930,6 +964,9 @@ type PaginatedTweetReplies struct {
// NextPage fetches a page of the tweet detail for this tweet ID at the given cursor by
// delegating to api.GetTweetDetail.
func (p PaginatedTweetReplies) NextPage(api *API, cursor string) (APIV2Response, error) {
	return api.GetTweetDetail(p.tweet_id, cursor)
}
// ToTweetTrove converts a tweet detail response into a TweetTrove using the response's own
// ToTweetTrove method.
func (p PaginatedTweetReplies) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
	return r.ToTweetTrove()
}
func (api *API) GetUserLikes(user_id UserID, cursor string) (APIV2Response, error) {
url, err := url.Parse(GraphqlURL{
@ -984,33 +1021,23 @@ type PaginatedUserLikes struct {
// NextPage fetches a page of this user's likes at the given cursor by delegating to
// api.GetUserLikes.
func (p PaginatedUserLikes) NextPage(api *API, cursor string) (APIV2Response, error) {
	return api.GetUserLikes(p.user_id, cursor)
}
func GetUserLikes(user_id UserID, how_many int) (TweetTrove, error) {
response, err := the_api.GetUserLikes(user_id, "")
if err != nil {
return TweetTrove{}, err
}
if len(response.GetMainInstruction().Entries) < how_many && response.GetCursorBottom() != "" {
err = the_api.GetMore(PaginatedUserLikes{user_id}, &response, how_many)
if errors.Is(err, END_OF_FEED) {
println("End of feed!")
} else if err != nil {
return TweetTrove{}, err
}
}
trove, err := response.ToTweetTroveAsLikes()
func (p PaginatedUserLikes) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
ret, err := r.ToTweetTroveAsLikes()
if err != nil {
return TweetTrove{}, err
}
// Fill out the liking UserID
for i := range trove.Likes {
l := trove.Likes[i]
l.UserID = user_id
trove.Likes[i] = l
for i := range ret.Likes {
l := ret.Likes[i]
l.UserID = p.user_id
ret.Likes[i] = l
}
return trove, nil
return ret, nil
}
// GetUserLikes runs a PaginatedUserLikes query for `user_id` through the package-level API
// client's GetPaginatedQuery, passing `how_many` as the requested count.
func GetUserLikes(user_id UserID, how_many int) (TweetTrove, error) {
	likes_query := PaginatedUserLikes{user_id: user_id}
	return the_api.GetPaginatedQuery(likes_query, how_many)
}
func (api *API) GetHomeTimeline(cursor string, is_for_you bool) (TweetTrove, error) {
@ -1175,3 +1202,6 @@ type PaginatedSearch struct {
// NextPage fetches a page of search results for this query at the given cursor by delegating
// to api.Search.
func (p PaginatedSearch) NextPage(api *API, cursor string) (APIV2Response, error) {
	return api.Search(p.query, cursor)
}
// ToTweetTrove converts a search response into a TweetTrove using the response's own
// ToTweetTrove method.
func (p PaginatedSearch) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
	return r.ToTweetTrove()
}

View File

@ -1,10 +1,5 @@
package scraper
import (
"errors"
"fmt"
)
// TimestampToDateString is an unimplemented stub: it always panics with "???" and never
// returns a value.
// NOTE(review): the unit of `timestamp` and the intended date format are not defined here.
func TimestampToDateString(timestamp int) string {
	// TODO: implement
	panic("???")
}
@ -18,28 +13,5 @@ func TimestampToDateString(timestamp int) string {
* - videos
*/
func Search(query string, min_results int) (trove TweetTrove, err error) {
api_response, err := the_api.Search(query, "")
if err != nil {
return
}
if len(api_response.GetMainInstruction().Entries) < min_results && api_response.GetCursorBottom() != "" {
err = the_api.GetMore(PaginatedSearch{query}, &api_response, min_results)
if errors.Is(err, END_OF_FEED) {
println("End of feed!")
} else if err != nil {
return
}
}
trove, err = api_response.ToTweetTrove()
if err != nil {
err = fmt.Errorf("Error parsing the tweet trove for search query %q:\n %w", query, err)
return
}
// Filling tombstones and tombstoned users is probably not necessary here, but we still
// need to fetch Spaces
err = trove.PostProcess()
return
return the_api.GetPaginatedQuery(PaginatedSearch{query}, min_results)
}

View File

@ -327,39 +327,16 @@ func GetTweetFull(id TweetID, how_many int) (trove TweetTrove, err error) {
return
}
func GetTweetFullAPIV2(id TweetID, how_many int) (trove TweetTrove, err error) {
resp, err := the_api.GetTweetDetail(id, "")
if err != nil {
err = fmt.Errorf("Error getting tweet detail: %d\n %w", id, err)
return
}
if len(resp.Errors) != 0 {
if resp.Errors[0].Message == "_Missing: No status found with that ID." {
func GetTweetFullAPIV2(id TweetID, how_many int) (TweetTrove, error) {
trove, err := the_api.GetPaginatedQuery(PaginatedTweetReplies{id}, how_many)
if errors.Is(err, ErrDoesntExist) {
trove := NewTweetTrove()
trove.Tweets[id] = Tweet{ID: id, TombstoneType: "deleted", IsConversationScraped: true}
return trove, nil
}
panic(fmt.Sprintf("Unknown error: %s", resp.Errors[0].Message))
}
err = the_api.GetMore(PaginatedTweetReplies{id}, &resp, how_many)
if err != nil && !errors.Is(err, END_OF_FEED) {
err = fmt.Errorf("Error getting more replies in tweet detail: %d\n %w", id, err)
return
}
trove, err = resp.ToTweetTrove()
if err != nil {
} else if err != nil {
return trove, err
}
// Quoted tombstones need their user_id filled out from the tombstoned_users list
log.Debug("Running tweet trove post-processing\n")
err = trove.PostProcess()
if err != nil {
err = fmt.Errorf("Error getting tweet (id %d):\n %w", id, err)
return
}
// Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at"
tweet, ok := trove.Tweets[id]
if !ok {
@ -369,5 +346,5 @@ func GetTweetFullAPIV2(id TweetID, how_many int) (trove TweetTrove, err error) {
tweet.IsConversationScraped = true
trove.Tweets[id] = tweet
return
return trove, err
}

View File

@ -33,25 +33,5 @@ func GetUserFeedFor(user_id UserID, min_tweets int) (trove TweetTrove, err error
}
func GetUserFeedGraphqlFor(user_id UserID, min_tweets int) (trove TweetTrove, err error) {
api_response, err := the_api.GetGraphqlFeedFor(user_id, "")
if err != nil {
err = fmt.Errorf("Error calling API to fetch user feed: UserID %d\n %w", user_id, err)
return
}
if len(api_response.GetMainInstruction().Entries) < min_tweets && api_response.GetCursorBottom() != "" {
err = the_api.GetMore(PaginatedUserFeed{user_id}, &api_response, min_tweets)
if err != nil && !errors.Is(err, END_OF_FEED) {
return
}
}
trove, err = api_response.ToTweetTrove()
if err != nil {
panic(err)
}
fmt.Println("------------")
err = trove.PostProcess()
return trove, err
return the_api.GetPaginatedQuery(PaginatedUserFeed{user_id}, min_tweets)
}