REFACTOR: create 'GetPaginatedQuery' function to encapsulate queries with cursors
This commit is contained in:
parent
4b6686cde6
commit
92b166a4eb
@ -1,12 +1,12 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"errors"
|
||||
)
|
||||
|
||||
var (
|
||||
END_OF_FEED = fmt.Errorf("End of feed")
|
||||
DOESNT_EXIST = fmt.Errorf("Doesn't exist")
|
||||
EXTERNAL_API_ERROR = fmt.Errorf("Unexpected result from external API")
|
||||
API_PARSE_ERROR = fmt.Errorf("Couldn't parse the result returned from the API")
|
||||
END_OF_FEED = errors.New("End of feed")
|
||||
ErrDoesntExist = errors.New("Doesn't exist")
|
||||
EXTERNAL_API_ERROR = errors.New("Unexpected result from external API")
|
||||
ErrorIsTombstone = errors.New("tweet is a tombstone")
|
||||
)
|
||||
|
@ -11,9 +11,6 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
var ErrorIsTombstone = errors.New("tweet is a tombstone")
|
||||
var ErrTweetNotFound = errors.New("api responded 'no status found with that ID'")
|
||||
|
||||
type CardValue struct {
|
||||
Type string `json:"type"`
|
||||
StringValue string `json:"string_value"`
|
||||
@ -808,6 +805,7 @@ func (r APIV2Response) ToTweetTroveAsLikes() (TweetTrove, error) {
|
||||
|
||||
type PaginatedQuery interface {
|
||||
NextPage(api *API, cursor string) (APIV2Response, error)
|
||||
ToTweetTrove(r APIV2Response) (TweetTrove, error)
|
||||
}
|
||||
|
||||
func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) error {
|
||||
@ -820,10 +818,12 @@ func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) e
|
||||
|
||||
if fresh_response.GetCursorBottom() == last_response.GetCursorBottom() && len(fresh_response.GetMainInstruction().Entries) == 0 {
|
||||
// Empty response, cursor same as previous: end of feed has been reached
|
||||
fmt.Printf("Cursor repeated; EOF\n")
|
||||
return END_OF_FEED
|
||||
}
|
||||
if fresh_response.IsEmpty() {
|
||||
// Response has a pinned tweet, but no other content: end of feed has been reached
|
||||
fmt.Printf("No non-pinned-tweet entries; EOF\n")
|
||||
return END_OF_FEED // TODO: check that there actually is a pinned tweet and the request didn't just fail lol
|
||||
}
|
||||
|
||||
@ -839,6 +839,31 @@ func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) e
|
||||
return nil
|
||||
}
|
||||
|
||||
func (api *API) GetPaginatedQuery(pq PaginatedQuery, count int) (TweetTrove, error) {
|
||||
fmt.Printf("Paginating %d count\n", count)
|
||||
api_response, err := pq.NextPage(api, "")
|
||||
if err != nil {
|
||||
return TweetTrove{}, fmt.Errorf("Error calling API to fetch query %#v:\n %w", pq, err)
|
||||
}
|
||||
if len(api_response.GetMainInstruction().Entries) < count && api_response.GetCursorBottom() != "" {
|
||||
err = api.GetMore(pq, &api_response, count)
|
||||
if errors.Is(err, END_OF_FEED) {
|
||||
println("End of feed!")
|
||||
} else if err != nil {
|
||||
return TweetTrove{}, err
|
||||
}
|
||||
}
|
||||
|
||||
trove, err := pq.ToTweetTrove(api_response)
|
||||
if err != nil {
|
||||
return TweetTrove{}, fmt.Errorf("Error parsing the tweet trove for query %#v:\n %w", pq, err)
|
||||
}
|
||||
|
||||
fmt.Println("------------")
|
||||
err = trove.PostProcess()
|
||||
return trove, err
|
||||
}
|
||||
|
||||
// Get a User feed using the new GraphQL twitter api
|
||||
func (api *API) GetGraphqlFeedFor(user_id UserID, cursor string) (APIV2Response, error) {
|
||||
url, err := url.Parse(GraphqlURL{
|
||||
@ -877,6 +902,9 @@ type PaginatedUserFeed struct {
|
||||
func (p PaginatedUserFeed) NextPage(api *API, cursor string) (APIV2Response, error) {
|
||||
return api.GetGraphqlFeedFor(p.user_id, cursor)
|
||||
}
|
||||
func (p PaginatedUserFeed) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
|
||||
return r.ToTweetTrove()
|
||||
}
|
||||
|
||||
func (api *API) GetTweetDetail(tweet_id TweetID, cursor string) (APIV2Response, error) {
|
||||
url, err := url.Parse(GraphqlURL{
|
||||
@ -919,6 +947,12 @@ func (api *API) GetTweetDetail(tweet_id TweetID, cursor string) (APIV2Response,
|
||||
|
||||
var response APIV2Response
|
||||
err = api.do_http(url.String(), cursor, &response)
|
||||
if len(response.Errors) != 0 {
|
||||
if response.Errors[0].Message == "_Missing: No status found with that ID." {
|
||||
return response, ErrDoesntExist
|
||||
}
|
||||
return response, fmt.Errorf("%w: %s", EXTERNAL_API_ERROR, response.Errors[0].Message)
|
||||
}
|
||||
|
||||
return response, err
|
||||
}
|
||||
@ -930,6 +964,9 @@ type PaginatedTweetReplies struct {
|
||||
func (p PaginatedTweetReplies) NextPage(api *API, cursor string) (APIV2Response, error) {
|
||||
return api.GetTweetDetail(p.tweet_id, cursor)
|
||||
}
|
||||
func (p PaginatedTweetReplies) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
|
||||
return r.ToTweetTrove()
|
||||
}
|
||||
|
||||
func (api *API) GetUserLikes(user_id UserID, cursor string) (APIV2Response, error) {
|
||||
url, err := url.Parse(GraphqlURL{
|
||||
@ -984,33 +1021,23 @@ type PaginatedUserLikes struct {
|
||||
func (p PaginatedUserLikes) NextPage(api *API, cursor string) (APIV2Response, error) {
|
||||
return api.GetUserLikes(p.user_id, cursor)
|
||||
}
|
||||
|
||||
func GetUserLikes(user_id UserID, how_many int) (TweetTrove, error) {
|
||||
response, err := the_api.GetUserLikes(user_id, "")
|
||||
if err != nil {
|
||||
return TweetTrove{}, err
|
||||
}
|
||||
|
||||
if len(response.GetMainInstruction().Entries) < how_many && response.GetCursorBottom() != "" {
|
||||
err = the_api.GetMore(PaginatedUserLikes{user_id}, &response, how_many)
|
||||
if errors.Is(err, END_OF_FEED) {
|
||||
println("End of feed!")
|
||||
} else if err != nil {
|
||||
return TweetTrove{}, err
|
||||
}
|
||||
}
|
||||
trove, err := response.ToTweetTroveAsLikes()
|
||||
func (p PaginatedUserLikes) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
|
||||
ret, err := r.ToTweetTroveAsLikes()
|
||||
if err != nil {
|
||||
return TweetTrove{}, err
|
||||
}
|
||||
|
||||
// Fill out the liking UserID
|
||||
for i := range trove.Likes {
|
||||
l := trove.Likes[i]
|
||||
l.UserID = user_id
|
||||
trove.Likes[i] = l
|
||||
for i := range ret.Likes {
|
||||
l := ret.Likes[i]
|
||||
l.UserID = p.user_id
|
||||
ret.Likes[i] = l
|
||||
}
|
||||
return trove, nil
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
func GetUserLikes(user_id UserID, how_many int) (TweetTrove, error) {
|
||||
return the_api.GetPaginatedQuery(PaginatedUserLikes{user_id}, how_many)
|
||||
}
|
||||
|
||||
func (api *API) GetHomeTimeline(cursor string, is_for_you bool) (TweetTrove, error) {
|
||||
@ -1175,3 +1202,6 @@ type PaginatedSearch struct {
|
||||
func (p PaginatedSearch) NextPage(api *API, cursor string) (APIV2Response, error) {
|
||||
return api.Search(p.query, cursor)
|
||||
}
|
||||
func (p PaginatedSearch) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
|
||||
return r.ToTweetTrove()
|
||||
}
|
||||
|
@ -1,10 +1,5 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// TimestampToDateString is an unimplemented placeholder: every call panics with "???",
// regardless of `timestamp`.
//
// TODO: implement the conversion (the intended timestamp unit and output format are not
// specified here).
func TimestampToDateString(timestamp int) string {
	panic("???") // TODO
}
|
||||
@ -18,28 +13,5 @@ func TimestampToDateString(timestamp int) string {
|
||||
* - videos
|
||||
*/
|
||||
func Search(query string, min_results int) (trove TweetTrove, err error) {
|
||||
api_response, err := the_api.Search(query, "")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
if len(api_response.GetMainInstruction().Entries) < min_results && api_response.GetCursorBottom() != "" {
|
||||
err = the_api.GetMore(PaginatedSearch{query}, &api_response, min_results)
|
||||
if errors.Is(err, END_OF_FEED) {
|
||||
println("End of feed!")
|
||||
} else if err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
trove, err = api_response.ToTweetTrove()
|
||||
if err != nil {
|
||||
err = fmt.Errorf("Error parsing the tweet trove for search query %q:\n %w", query, err)
|
||||
return
|
||||
}
|
||||
|
||||
// Filling tombstones and tombstoned users is probably not necessary here, but we still
|
||||
// need to fetch Spaces
|
||||
err = trove.PostProcess()
|
||||
return
|
||||
return the_api.GetPaginatedQuery(PaginatedSearch{query}, min_results)
|
||||
}
|
||||
|
@ -327,39 +327,16 @@ func GetTweetFull(id TweetID, how_many int) (trove TweetTrove, err error) {
|
||||
return
|
||||
}
|
||||
|
||||
func GetTweetFullAPIV2(id TweetID, how_many int) (trove TweetTrove, err error) {
|
||||
resp, err := the_api.GetTweetDetail(id, "")
|
||||
if err != nil {
|
||||
err = fmt.Errorf("Error getting tweet detail: %d\n %w", id, err)
|
||||
return
|
||||
}
|
||||
if len(resp.Errors) != 0 {
|
||||
if resp.Errors[0].Message == "_Missing: No status found with that ID." {
|
||||
func GetTweetFullAPIV2(id TweetID, how_many int) (TweetTrove, error) {
|
||||
trove, err := the_api.GetPaginatedQuery(PaginatedTweetReplies{id}, how_many)
|
||||
if errors.Is(err, ErrDoesntExist) {
|
||||
trove := NewTweetTrove()
|
||||
trove.Tweets[id] = Tweet{ID: id, TombstoneType: "deleted", IsConversationScraped: true}
|
||||
return trove, nil
|
||||
}
|
||||
panic(fmt.Sprintf("Unknown error: %s", resp.Errors[0].Message))
|
||||
}
|
||||
|
||||
err = the_api.GetMore(PaginatedTweetReplies{id}, &resp, how_many)
|
||||
if err != nil && !errors.Is(err, END_OF_FEED) {
|
||||
err = fmt.Errorf("Error getting more replies in tweet detail: %d\n %w", id, err)
|
||||
return
|
||||
}
|
||||
trove, err = resp.ToTweetTrove()
|
||||
if err != nil {
|
||||
} else if err != nil {
|
||||
return trove, err
|
||||
}
|
||||
|
||||
// Quoted tombstones need their user_id filled out from the tombstoned_users list
|
||||
log.Debug("Running tweet trove post-processing\n")
|
||||
err = trove.PostProcess()
|
||||
if err != nil {
|
||||
err = fmt.Errorf("Error getting tweet (id %d):\n %w", id, err)
|
||||
return
|
||||
}
|
||||
|
||||
// Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at"
|
||||
tweet, ok := trove.Tweets[id]
|
||||
if !ok {
|
||||
@ -369,5 +346,5 @@ func GetTweetFullAPIV2(id TweetID, how_many int) (trove TweetTrove, err error) {
|
||||
tweet.IsConversationScraped = true
|
||||
trove.Tweets[id] = tweet
|
||||
|
||||
return
|
||||
return trove, err
|
||||
}
|
||||
|
@ -33,25 +33,5 @@ func GetUserFeedFor(user_id UserID, min_tweets int) (trove TweetTrove, err error
|
||||
}
|
||||
|
||||
func GetUserFeedGraphqlFor(user_id UserID, min_tweets int) (trove TweetTrove, err error) {
|
||||
api_response, err := the_api.GetGraphqlFeedFor(user_id, "")
|
||||
if err != nil {
|
||||
err = fmt.Errorf("Error calling API to fetch user feed: UserID %d\n %w", user_id, err)
|
||||
return
|
||||
}
|
||||
|
||||
if len(api_response.GetMainInstruction().Entries) < min_tweets && api_response.GetCursorBottom() != "" {
|
||||
err = the_api.GetMore(PaginatedUserFeed{user_id}, &api_response, min_tweets)
|
||||
if err != nil && !errors.Is(err, END_OF_FEED) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
trove, err = api_response.ToTweetTrove()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
fmt.Println("------------")
|
||||
err = trove.PostProcess()
|
||||
return trove, err
|
||||
return the_api.GetPaginatedQuery(PaginatedUserFeed{user_id}, min_tweets)
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user