REFACTOR: create 'GetPaginatedQuery' function to encapsulate queries with cursors
This commit is contained in:
parent
4b6686cde6
commit
92b166a4eb
@ -1,12 +1,12 @@
|
|||||||
package scraper
|
package scraper
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"errors"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
END_OF_FEED = fmt.Errorf("End of feed")
|
END_OF_FEED = errors.New("End of feed")
|
||||||
DOESNT_EXIST = fmt.Errorf("Doesn't exist")
|
ErrDoesntExist = errors.New("Doesn't exist")
|
||||||
EXTERNAL_API_ERROR = fmt.Errorf("Unexpected result from external API")
|
EXTERNAL_API_ERROR = errors.New("Unexpected result from external API")
|
||||||
API_PARSE_ERROR = fmt.Errorf("Couldn't parse the result returned from the API")
|
ErrorIsTombstone = errors.New("tweet is a tombstone")
|
||||||
)
|
)
|
||||||
|
@ -11,9 +11,6 @@ import (
|
|||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
)
|
)
|
||||||
|
|
||||||
var ErrorIsTombstone = errors.New("tweet is a tombstone")
|
|
||||||
var ErrTweetNotFound = errors.New("api responded 'no status found with that ID'")
|
|
||||||
|
|
||||||
type CardValue struct {
|
type CardValue struct {
|
||||||
Type string `json:"type"`
|
Type string `json:"type"`
|
||||||
StringValue string `json:"string_value"`
|
StringValue string `json:"string_value"`
|
||||||
@ -808,6 +805,7 @@ func (r APIV2Response) ToTweetTroveAsLikes() (TweetTrove, error) {
|
|||||||
|
|
||||||
type PaginatedQuery interface {
|
type PaginatedQuery interface {
|
||||||
NextPage(api *API, cursor string) (APIV2Response, error)
|
NextPage(api *API, cursor string) (APIV2Response, error)
|
||||||
|
ToTweetTrove(r APIV2Response) (TweetTrove, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) error {
|
func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) error {
|
||||||
@ -820,10 +818,12 @@ func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) e
|
|||||||
|
|
||||||
if fresh_response.GetCursorBottom() == last_response.GetCursorBottom() && len(fresh_response.GetMainInstruction().Entries) == 0 {
|
if fresh_response.GetCursorBottom() == last_response.GetCursorBottom() && len(fresh_response.GetMainInstruction().Entries) == 0 {
|
||||||
// Empty response, cursor same as previous: end of feed has been reached
|
// Empty response, cursor same as previous: end of feed has been reached
|
||||||
|
fmt.Printf("Cursor repeated; EOF\n")
|
||||||
return END_OF_FEED
|
return END_OF_FEED
|
||||||
}
|
}
|
||||||
if fresh_response.IsEmpty() {
|
if fresh_response.IsEmpty() {
|
||||||
// Response has a pinned tweet, but no other content: end of feed has been reached
|
// Response has a pinned tweet, but no other content: end of feed has been reached
|
||||||
|
fmt.Printf("No non-pinned-tweet entries; EOF\n")
|
||||||
return END_OF_FEED // TODO: check that there actually is a pinned tweet and the request didn't just fail lol
|
return END_OF_FEED // TODO: check that there actually is a pinned tweet and the request didn't just fail lol
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -839,6 +839,31 @@ func (api *API) GetMore(pq PaginatedQuery, response *APIV2Response, count int) e
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (api *API) GetPaginatedQuery(pq PaginatedQuery, count int) (TweetTrove, error) {
|
||||||
|
fmt.Printf("Paginating %d count\n", count)
|
||||||
|
api_response, err := pq.NextPage(api, "")
|
||||||
|
if err != nil {
|
||||||
|
return TweetTrove{}, fmt.Errorf("Error calling API to fetch query %#v:\n %w", pq, err)
|
||||||
|
}
|
||||||
|
if len(api_response.GetMainInstruction().Entries) < count && api_response.GetCursorBottom() != "" {
|
||||||
|
err = api.GetMore(pq, &api_response, count)
|
||||||
|
if errors.Is(err, END_OF_FEED) {
|
||||||
|
println("End of feed!")
|
||||||
|
} else if err != nil {
|
||||||
|
return TweetTrove{}, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
trove, err := pq.ToTweetTrove(api_response)
|
||||||
|
if err != nil {
|
||||||
|
return TweetTrove{}, fmt.Errorf("Error parsing the tweet trove for query %#v:\n %w", pq, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Println("------------")
|
||||||
|
err = trove.PostProcess()
|
||||||
|
return trove, err
|
||||||
|
}
|
||||||
|
|
||||||
// Get a User feed using the new GraphQL twitter api
|
// Get a User feed using the new GraphQL twitter api
|
||||||
func (api *API) GetGraphqlFeedFor(user_id UserID, cursor string) (APIV2Response, error) {
|
func (api *API) GetGraphqlFeedFor(user_id UserID, cursor string) (APIV2Response, error) {
|
||||||
url, err := url.Parse(GraphqlURL{
|
url, err := url.Parse(GraphqlURL{
|
||||||
@ -877,6 +902,9 @@ type PaginatedUserFeed struct {
|
|||||||
func (p PaginatedUserFeed) NextPage(api *API, cursor string) (APIV2Response, error) {
|
func (p PaginatedUserFeed) NextPage(api *API, cursor string) (APIV2Response, error) {
|
||||||
return api.GetGraphqlFeedFor(p.user_id, cursor)
|
return api.GetGraphqlFeedFor(p.user_id, cursor)
|
||||||
}
|
}
|
||||||
|
func (p PaginatedUserFeed) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
|
||||||
|
return r.ToTweetTrove()
|
||||||
|
}
|
||||||
|
|
||||||
func (api *API) GetTweetDetail(tweet_id TweetID, cursor string) (APIV2Response, error) {
|
func (api *API) GetTweetDetail(tweet_id TweetID, cursor string) (APIV2Response, error) {
|
||||||
url, err := url.Parse(GraphqlURL{
|
url, err := url.Parse(GraphqlURL{
|
||||||
@ -919,6 +947,12 @@ func (api *API) GetTweetDetail(tweet_id TweetID, cursor string) (APIV2Response,
|
|||||||
|
|
||||||
var response APIV2Response
|
var response APIV2Response
|
||||||
err = api.do_http(url.String(), cursor, &response)
|
err = api.do_http(url.String(), cursor, &response)
|
||||||
|
if len(response.Errors) != 0 {
|
||||||
|
if response.Errors[0].Message == "_Missing: No status found with that ID." {
|
||||||
|
return response, ErrDoesntExist
|
||||||
|
}
|
||||||
|
return response, fmt.Errorf("%w: %s", EXTERNAL_API_ERROR, response.Errors[0].Message)
|
||||||
|
}
|
||||||
|
|
||||||
return response, err
|
return response, err
|
||||||
}
|
}
|
||||||
@ -930,6 +964,9 @@ type PaginatedTweetReplies struct {
|
|||||||
func (p PaginatedTweetReplies) NextPage(api *API, cursor string) (APIV2Response, error) {
|
func (p PaginatedTweetReplies) NextPage(api *API, cursor string) (APIV2Response, error) {
|
||||||
return api.GetTweetDetail(p.tweet_id, cursor)
|
return api.GetTweetDetail(p.tweet_id, cursor)
|
||||||
}
|
}
|
||||||
|
func (p PaginatedTweetReplies) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
|
||||||
|
return r.ToTweetTrove()
|
||||||
|
}
|
||||||
|
|
||||||
func (api *API) GetUserLikes(user_id UserID, cursor string) (APIV2Response, error) {
|
func (api *API) GetUserLikes(user_id UserID, cursor string) (APIV2Response, error) {
|
||||||
url, err := url.Parse(GraphqlURL{
|
url, err := url.Parse(GraphqlURL{
|
||||||
@ -984,33 +1021,23 @@ type PaginatedUserLikes struct {
|
|||||||
func (p PaginatedUserLikes) NextPage(api *API, cursor string) (APIV2Response, error) {
|
func (p PaginatedUserLikes) NextPage(api *API, cursor string) (APIV2Response, error) {
|
||||||
return api.GetUserLikes(p.user_id, cursor)
|
return api.GetUserLikes(p.user_id, cursor)
|
||||||
}
|
}
|
||||||
|
func (p PaginatedUserLikes) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
|
||||||
func GetUserLikes(user_id UserID, how_many int) (TweetTrove, error) {
|
ret, err := r.ToTweetTroveAsLikes()
|
||||||
response, err := the_api.GetUserLikes(user_id, "")
|
|
||||||
if err != nil {
|
|
||||||
return TweetTrove{}, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(response.GetMainInstruction().Entries) < how_many && response.GetCursorBottom() != "" {
|
|
||||||
err = the_api.GetMore(PaginatedUserLikes{user_id}, &response, how_many)
|
|
||||||
if errors.Is(err, END_OF_FEED) {
|
|
||||||
println("End of feed!")
|
|
||||||
} else if err != nil {
|
|
||||||
return TweetTrove{}, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
trove, err := response.ToTweetTroveAsLikes()
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return TweetTrove{}, err
|
return TweetTrove{}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fill out the liking UserID
|
// Fill out the liking UserID
|
||||||
for i := range trove.Likes {
|
for i := range ret.Likes {
|
||||||
l := trove.Likes[i]
|
l := ret.Likes[i]
|
||||||
l.UserID = user_id
|
l.UserID = p.user_id
|
||||||
trove.Likes[i] = l
|
ret.Likes[i] = l
|
||||||
}
|
}
|
||||||
return trove, nil
|
return ret, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetUserLikes(user_id UserID, how_many int) (TweetTrove, error) {
|
||||||
|
return the_api.GetPaginatedQuery(PaginatedUserLikes{user_id}, how_many)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (api *API) GetHomeTimeline(cursor string, is_for_you bool) (TweetTrove, error) {
|
func (api *API) GetHomeTimeline(cursor string, is_for_you bool) (TweetTrove, error) {
|
||||||
@ -1175,3 +1202,6 @@ type PaginatedSearch struct {
|
|||||||
func (p PaginatedSearch) NextPage(api *API, cursor string) (APIV2Response, error) {
|
func (p PaginatedSearch) NextPage(api *API, cursor string) (APIV2Response, error) {
|
||||||
return api.Search(p.query, cursor)
|
return api.Search(p.query, cursor)
|
||||||
}
|
}
|
||||||
|
func (p PaginatedSearch) ToTweetTrove(r APIV2Response) (TweetTrove, error) {
|
||||||
|
return r.ToTweetTrove()
|
||||||
|
}
|
||||||
|
@ -1,10 +1,5 @@
|
|||||||
package scraper
|
package scraper
|
||||||
|
|
||||||
import (
|
|
||||||
"errors"
|
|
||||||
"fmt"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TimestampToDateString(timestamp int) string {
|
func TimestampToDateString(timestamp int) string {
|
||||||
panic("???") // TODO
|
panic("???") // TODO
|
||||||
}
|
}
|
||||||
@ -18,28 +13,5 @@ func TimestampToDateString(timestamp int) string {
|
|||||||
* - videos
|
* - videos
|
||||||
*/
|
*/
|
||||||
func Search(query string, min_results int) (trove TweetTrove, err error) {
|
func Search(query string, min_results int) (trove TweetTrove, err error) {
|
||||||
api_response, err := the_api.Search(query, "")
|
return the_api.GetPaginatedQuery(PaginatedSearch{query}, min_results)
|
||||||
if err != nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(api_response.GetMainInstruction().Entries) < min_results && api_response.GetCursorBottom() != "" {
|
|
||||||
err = the_api.GetMore(PaginatedSearch{query}, &api_response, min_results)
|
|
||||||
if errors.Is(err, END_OF_FEED) {
|
|
||||||
println("End of feed!")
|
|
||||||
} else if err != nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
trove, err = api_response.ToTweetTrove()
|
|
||||||
if err != nil {
|
|
||||||
err = fmt.Errorf("Error parsing the tweet trove for search query %q:\n %w", query, err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// Filling tombstones and tombstoned users is probably not necessary here, but we still
|
|
||||||
// need to fetch Spaces
|
|
||||||
err = trove.PostProcess()
|
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
@ -327,39 +327,16 @@ func GetTweetFull(id TweetID, how_many int) (trove TweetTrove, err error) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetTweetFullAPIV2(id TweetID, how_many int) (trove TweetTrove, err error) {
|
func GetTweetFullAPIV2(id TweetID, how_many int) (TweetTrove, error) {
|
||||||
resp, err := the_api.GetTweetDetail(id, "")
|
trove, err := the_api.GetPaginatedQuery(PaginatedTweetReplies{id}, how_many)
|
||||||
if err != nil {
|
if errors.Is(err, ErrDoesntExist) {
|
||||||
err = fmt.Errorf("Error getting tweet detail: %d\n %w", id, err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if len(resp.Errors) != 0 {
|
|
||||||
if resp.Errors[0].Message == "_Missing: No status found with that ID." {
|
|
||||||
trove := NewTweetTrove()
|
trove := NewTweetTrove()
|
||||||
trove.Tweets[id] = Tweet{ID: id, TombstoneType: "deleted", IsConversationScraped: true}
|
trove.Tweets[id] = Tweet{ID: id, TombstoneType: "deleted", IsConversationScraped: true}
|
||||||
return trove, nil
|
return trove, nil
|
||||||
}
|
} else if err != nil {
|
||||||
panic(fmt.Sprintf("Unknown error: %s", resp.Errors[0].Message))
|
|
||||||
}
|
|
||||||
|
|
||||||
err = the_api.GetMore(PaginatedTweetReplies{id}, &resp, how_many)
|
|
||||||
if err != nil && !errors.Is(err, END_OF_FEED) {
|
|
||||||
err = fmt.Errorf("Error getting more replies in tweet detail: %d\n %w", id, err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
trove, err = resp.ToTweetTrove()
|
|
||||||
if err != nil {
|
|
||||||
return trove, err
|
return trove, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Quoted tombstones need their user_id filled out from the tombstoned_users list
|
|
||||||
log.Debug("Running tweet trove post-processing\n")
|
|
||||||
err = trove.PostProcess()
|
|
||||||
if err != nil {
|
|
||||||
err = fmt.Errorf("Error getting tweet (id %d):\n %w", id, err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at"
|
// Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at"
|
||||||
tweet, ok := trove.Tweets[id]
|
tweet, ok := trove.Tweets[id]
|
||||||
if !ok {
|
if !ok {
|
||||||
@ -369,5 +346,5 @@ func GetTweetFullAPIV2(id TweetID, how_many int) (trove TweetTrove, err error) {
|
|||||||
tweet.IsConversationScraped = true
|
tweet.IsConversationScraped = true
|
||||||
trove.Tweets[id] = tweet
|
trove.Tweets[id] = tweet
|
||||||
|
|
||||||
return
|
return trove, err
|
||||||
}
|
}
|
||||||
|
@ -33,25 +33,5 @@ func GetUserFeedFor(user_id UserID, min_tweets int) (trove TweetTrove, err error
|
|||||||
}
|
}
|
||||||
|
|
||||||
func GetUserFeedGraphqlFor(user_id UserID, min_tweets int) (trove TweetTrove, err error) {
|
func GetUserFeedGraphqlFor(user_id UserID, min_tweets int) (trove TweetTrove, err error) {
|
||||||
api_response, err := the_api.GetGraphqlFeedFor(user_id, "")
|
return the_api.GetPaginatedQuery(PaginatedUserFeed{user_id}, min_tweets)
|
||||||
if err != nil {
|
|
||||||
err = fmt.Errorf("Error calling API to fetch user feed: UserID %d\n %w", user_id, err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(api_response.GetMainInstruction().Entries) < min_tweets && api_response.GetCursorBottom() != "" {
|
|
||||||
err = the_api.GetMore(PaginatedUserFeed{user_id}, &api_response, min_tweets)
|
|
||||||
if err != nil && !errors.Is(err, END_OF_FEED) {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
trove, err = api_response.ToTweetTrove()
|
|
||||||
if err != nil {
|
|
||||||
panic(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Println("------------")
|
|
||||||
err = trove.PostProcess()
|
|
||||||
return trove, err
|
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user