From d54e77b1693ded1fa864022a7cf2579a737035ee Mon Sep 17 00:00:00 2001 From: Alessio Date: Sun, 13 Nov 2022 12:04:25 -0500 Subject: [PATCH] Some housekeeping --- persistence/tweet_queries.go | 2 +- persistence/tweet_trove_queries.go | 17 ++++++++++++++++- persistence/user_queries.go | 4 +++- scraper/api_request_utils.go | 2 +- scraper/search.go | 12 +++++++++++- scraper/tweet.go | 9 +++++---- scraper/tweet_test.go | 14 ++++++-------- scraper/tweet_trove.go | 25 ++++++------------------- scraper/user_feed.go | 6 ++---- 9 files changed, 51 insertions(+), 40 deletions(-) diff --git a/persistence/tweet_queries.go b/persistence/tweet_queries.go index f328103..a731b14 100644 --- a/persistence/tweet_queries.go +++ b/persistence/tweet_queries.go @@ -61,7 +61,7 @@ func (p Profile) SaveTweet(t scraper.Tweet) error { ) if err != nil { - return fmt.Errorf("Error executing SaveTweet(ID %d):\n %w", t.ID, err) + return fmt.Errorf("Error executing SaveTweet(ID %d). Info: %#v:\n %w", t.ID, t, err) } for _, url := range t.Urls { err := p.SaveUrl(url) diff --git a/persistence/tweet_trove_queries.go b/persistence/tweet_trove_queries.go index 9c51773..e89993b 100644 --- a/persistence/tweet_trove_queries.go +++ b/persistence/tweet_trove_queries.go @@ -18,6 +18,19 @@ func (p Profile) SaveTweetTrove(trove TweetTrove) { } fmt.Println(u.Handle, u.ID) // If the User's ID was updated in saving (i.e., Unknown User), update it in the Trove too + // Also update tweets, retweets and spaces that reference this UserID + for j, tweet := range trove.Tweets { + if tweet.UserID == trove.Users[i].ID { + tweet.UserID = u.ID + trove.Tweets[j] = tweet + } + } + for j, retweet := range trove.Retweets { + if retweet.RetweetedByID == trove.Users[i].ID { + retweet.RetweetedByID = u.ID + trove.Retweets[j] = retweet + } + } trove.Users[i] = u // Download their tiny profile image @@ -28,7 +41,9 @@ func (p Profile) SaveTweetTrove(trove TweetTrove) { } // TODO: this is called earlier in the process as well, before parsing. Is that call redundant? Too tired to figure out right now - trove.FillMissingUserIDs() + // Update: Yes it's redundant. Places that return tweet troves should call `PostProcess` + // before returning, which includes `FillMissingUserIDs`. + // trove.FillMissingUserIDs() for _, t := range trove.Tweets { err := p.SaveTweet(t) diff --git a/persistence/user_queries.go b/persistence/user_queries.go index 6a43e1f..4323102 100644 --- a/persistence/user_queries.go +++ b/persistence/user_queries.go @@ -23,7 +23,9 @@ func (p Profile) SaveUser(u *scraper.User) error { // We need to continue-- create a new fake user u.ID = p.NextFakeUserID() } else if err == nil { - // We're done; everything is fine (ID has already been scanned into the User) + // We're done; a user exists with this handle already. No need to fake anything, and we have no new data + // to provide (since the ID is fake). + // ID has already been scanned into the User, for use by the caller. return nil } else { // A real error occurred diff --git a/scraper/api_request_utils.go b/scraper/api_request_utils.go index 2399881..73c9416 100644 --- a/scraper/api_request_utils.go +++ b/scraper/api_request_utils.go @@ -220,7 +220,7 @@ func (api API) GetUser(handle UserHandle) (APIUser, error) { if err != nil { return APIUser{}, fmt.Errorf("Error retrieving API response to GetUser(%s):\n %w", handle, err) } - log.Debug(string(body)) + log.Debug("GetUser(" + string(handle) + "): " + string(body)) err = json.Unmarshal(body, &response) if err != nil { diff --git a/scraper/search.go b/scraper/search.go index 696c735..55b37fa 100644 --- a/scraper/search.go +++ b/scraper/search.go @@ -2,6 +2,7 @@ package scraper import ( "errors" + "fmt" ) func TimestampToDateString(timestamp int) string { @@ -32,5 +33,14 @@ func Search(query string, min_results int) (trove TweetTrove, err error) { } } - return ParseTweetResponse(tweet_response) + trove, err = ParseTweetResponse(tweet_response) + if err != nil { + err = fmt.Errorf("Error parsing the tweet trove for search query %q:\n %w", query, err) + return + } + + // Filling tombstones and tombstoned users is probably not necessary here, but we still + // need to fetch Spaces + err = trove.PostProcess() + return } diff --git a/scraper/tweet.go b/scraper/tweet.go index 7aed758..8880c42 100644 --- a/scraper/tweet.go +++ b/scraper/tweet.go @@ -256,10 +256,13 @@ func GetTweetFull(id TweetID) (trove TweetTrove, err error) { panic(err) } trove.TombstoneUsers = tombstoned_users - trove.FetchTombstoneUsers() // Quoted tombstones need their user_id filled out from the tombstoned_users list - trove.FillMissingUserIDs() + err = trove.PostProcess() + if err != nil { + err = fmt.Errorf("Error getting tweet (id %d):\n %w", id, err) + return + } // Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at" tweet, ok := trove.Tweets[id] @@ -270,8 +273,6 @@ func GetTweetFull(id TweetID) (trove TweetTrove, err error) { tweet.IsConversationScraped = true trove.Tweets[id] = tweet - // tweets, retweets, users = trove.Transform() - return } diff --git a/scraper/tweet_test.go b/scraper/tweet_test.go index 0f2e255..87e2e2a 100644 --- a/scraper/tweet_test.go +++ b/scraper/tweet_test.go @@ -227,11 +227,10 @@ func TestParseTweetResponse(t *testing.T) { trove, err := ParseTweetResponse(tweet_resp) require.NoError(t, err) - tweets, retweets, users := trove.Transform() - assert.Len(tweets, 29-3) - assert.Len(retweets, 3) - assert.Len(users, 9) + assert.Len(trove.Tweets, 29-3) + assert.Len(trove.Retweets, 3) + assert.Len(trove.Users, 9) } func TestParseTweetResponseWithTombstones(t *testing.T) { @@ -249,9 +248,8 @@ func TestParseTweetResponseWithTombstones(t *testing.T) { trove, err := ParseTweetResponse(tweet_resp) require.NoError(t, err) - tweets, retweets, users := trove.Transform() - assert.Len(tweets, 2) - assert.Len(retweets, 0) - assert.Len(users, 1) + assert.Len(trove.Tweets, 2) + assert.Len(trove.Retweets, 0) + assert.Len(trove.Users, 1) } diff --git a/scraper/tweet_trove.go b/scraper/tweet_trove.go index 3b263ab..6c11e51 100644 --- a/scraper/tweet_trove.go +++ b/scraper/tweet_trove.go @@ -24,22 +24,6 @@ func NewTweetTrove() TweetTrove { return ret } -/** - * Make it compatible with previous silly interface if needed - */ -func (trove TweetTrove) Transform() (tweets []Tweet, retweets []Retweet, users []User) { - for _, val := range trove.Tweets { - tweets = append(tweets, val) - } - for _, val := range trove.Users { - users = append(users, val) - } - for _, val := range trove.Retweets { - retweets = append(retweets, val) - } - return -} // TODO: refactor until this function isn't needed anymore - /** * Search for a user by handle. Second param is whether the user was found or not. */ @@ -107,9 +91,6 @@ func (trove *TweetTrove) FetchTombstoneUsers() { * * At this point, those users should have been added to this trove's Users collection, and the * Tweets have a field `UserHandle` which can be used to pair them with newly fetched Users. - * - * This will still fail if the user deleted their account (instead of getting banned, blocking the - * quote-tweeter, etc), because then that user won't show up . */ func (trove *TweetTrove) FillMissingUserIDs() { for i := range trove.Tweets { @@ -133,3 +114,9 @@ func (trove *TweetTrove) FillMissingUserIDs() { trove.Tweets[i] = tweet } } + +func (trove *TweetTrove) PostProcess() error { + trove.FetchTombstoneUsers() + trove.FillMissingUserIDs() + return nil +} diff --git a/scraper/user_feed.go b/scraper/user_feed.go index 332a35e..ce1305e 100644 --- a/scraper/user_feed.go +++ b/scraper/user_feed.go @@ -54,8 +54,6 @@ func GetUserFeedGraphqlFor(user_id UserID, min_tweets int) (trove TweetTrove, er } fmt.Println("------------") - trove.FetchTombstoneUsers() - trove.FillMissingUserIDs() - - return trove, nil + err = trove.PostProcess() + return trove, err }