Some housekeeping

This commit is contained in:
Alessio 2022-11-13 12:04:25 -05:00
parent b78cef34ce
commit d54e77b169
9 changed files with 51 additions and 40 deletions

View File

@ -61,7 +61,7 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
) )
if err != nil { if err != nil {
return fmt.Errorf("Error executing SaveTweet(ID %d):\n %w", t.ID, err) return fmt.Errorf("Error executing SaveTweet(ID %d). Info: %#v:\n %w", t.ID, t, err)
} }
for _, url := range t.Urls { for _, url := range t.Urls {
err := p.SaveUrl(url) err := p.SaveUrl(url)

View File

@ -18,6 +18,19 @@ func (p Profile) SaveTweetTrove(trove TweetTrove) {
} }
fmt.Println(u.Handle, u.ID) fmt.Println(u.Handle, u.ID)
// If the User's ID was updated in saving (i.e., Unknown User), update it in the Trove too // If the User's ID was updated in saving (i.e., Unknown User), update it in the Trove too
// Also update tweets, retweets and spaces that reference this UserID
for j, tweet := range trove.Tweets {
if tweet.UserID == trove.Users[i].ID {
tweet.UserID = u.ID
trove.Tweets[j] = tweet
}
}
for j, retweet := range trove.Retweets {
if retweet.RetweetedByID == trove.Users[i].ID {
retweet.RetweetedByID = u.ID
trove.Retweets[j] = retweet
}
}
trove.Users[i] = u trove.Users[i] = u
// Download their tiny profile image // Download their tiny profile image
@ -28,7 +41,9 @@ func (p Profile) SaveTweetTrove(trove TweetTrove) {
} }
// TODO: this is called earlier in the process as well, before parsing. Is that call redundant? Too tired to figure out right now // TODO: this is called earlier in the process as well, before parsing. Is that call redundant? Too tired to figure out right now
trove.FillMissingUserIDs() // Update: Yes it's redundant. Places that return tweet troves should call `PostProcess`
// before returning, which includes `FillMissingUserIDs`.
// trove.FillMissingUserIDs()
for _, t := range trove.Tweets { for _, t := range trove.Tweets {
err := p.SaveTweet(t) err := p.SaveTweet(t)

View File

@ -23,7 +23,9 @@ func (p Profile) SaveUser(u *scraper.User) error {
// We need to continue-- create a new fake user // We need to continue-- create a new fake user
u.ID = p.NextFakeUserID() u.ID = p.NextFakeUserID()
} else if err == nil { } else if err == nil {
// We're done; everything is fine (ID has already been scanned into the User) // We're done; a user exists with this handle already. No need to fake anything, and we have no new data
// to provide (since the ID is fake).
// ID has already been scanned into the User, for use by the caller.
return nil return nil
} else { } else {
// A real error occurred // A real error occurred

View File

@ -220,7 +220,7 @@ func (api API) GetUser(handle UserHandle) (APIUser, error) {
if err != nil { if err != nil {
return APIUser{}, fmt.Errorf("Error retrieving API response to GetUser(%s):\n %w", handle, err) return APIUser{}, fmt.Errorf("Error retrieving API response to GetUser(%s):\n %w", handle, err)
} }
log.Debug(string(body)) log.Debug("GetUser(" + string(handle) + "): " + string(body))
err = json.Unmarshal(body, &response) err = json.Unmarshal(body, &response)
if err != nil { if err != nil {

View File

@ -2,6 +2,7 @@ package scraper
import ( import (
"errors" "errors"
"fmt"
) )
func TimestampToDateString(timestamp int) string { func TimestampToDateString(timestamp int) string {
@ -32,5 +33,14 @@ func Search(query string, min_results int) (trove TweetTrove, err error) {
} }
} }
return ParseTweetResponse(tweet_response) trove, err = ParseTweetResponse(tweet_response)
if err != nil {
err = fmt.Errorf("Error parsing the tweet trove for search query %q:\n %w", query, err)
return
}
// Filling tombstones and tombstoned users is probably not necessary here, but we still
// need to fetch Spaces
err = trove.PostProcess()
return
} }

View File

@ -256,10 +256,13 @@ func GetTweetFull(id TweetID) (trove TweetTrove, err error) {
panic(err) panic(err)
} }
trove.TombstoneUsers = tombstoned_users trove.TombstoneUsers = tombstoned_users
trove.FetchTombstoneUsers()
// Quoted tombstones need their user_id filled out from the tombstoned_users list // Quoted tombstones need their user_id filled out from the tombstoned_users list
trove.FillMissingUserIDs() err = trove.PostProcess()
if err != nil {
err = fmt.Errorf("Error getting tweet (id %d):\n %w", id, err)
return
}
// Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at" // Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at"
tweet, ok := trove.Tweets[id] tweet, ok := trove.Tweets[id]
@ -270,8 +273,6 @@ func GetTweetFull(id TweetID) (trove TweetTrove, err error) {
tweet.IsConversationScraped = true tweet.IsConversationScraped = true
trove.Tweets[id] = tweet trove.Tweets[id] = tweet
// tweets, retweets, users = trove.Transform()
return return
} }

View File

@ -227,11 +227,10 @@ func TestParseTweetResponse(t *testing.T) {
trove, err := ParseTweetResponse(tweet_resp) trove, err := ParseTweetResponse(tweet_resp)
require.NoError(t, err) require.NoError(t, err)
tweets, retweets, users := trove.Transform()
assert.Len(tweets, 29-3) assert.Len(trove.Tweets, 29-3)
assert.Len(retweets, 3) assert.Len(trove.Retweets, 3)
assert.Len(users, 9) assert.Len(trove.Users, 9)
} }
func TestParseTweetResponseWithTombstones(t *testing.T) { func TestParseTweetResponseWithTombstones(t *testing.T) {
@ -249,9 +248,8 @@ func TestParseTweetResponseWithTombstones(t *testing.T) {
trove, err := ParseTweetResponse(tweet_resp) trove, err := ParseTweetResponse(tweet_resp)
require.NoError(t, err) require.NoError(t, err)
tweets, retweets, users := trove.Transform()
assert.Len(tweets, 2) assert.Len(trove.Tweets, 2)
assert.Len(retweets, 0) assert.Len(trove.Retweets, 0)
assert.Len(users, 1) assert.Len(trove.Users, 1)
} }

View File

@ -24,22 +24,6 @@ func NewTweetTrove() TweetTrove {
return ret return ret
} }
/**
 * Flatten the trove into three plain slices: its Tweets, its Retweets and its Users.
 *
 * This exists only as a bridge for callers written against the older slice-based interface.
 * Elements are appended in whatever order ranging over each collection yields, so callers
 * should not depend on ordering.  A collection with no entries produces a nil slice.
 */
func (trove TweetTrove) Transform() (tweets []Tweet, retweets []Retweet, users []User) {
	for _, tweet := range trove.Tweets {
		tweets = append(tweets, tweet)
	}
	for _, retweet := range trove.Retweets {
		retweets = append(retweets, retweet)
	}
	for _, user := range trove.Users {
		users = append(users, user)
	}
	return tweets, retweets, users
} // TODO: refactor until this function isn't needed anymore
/** /**
* Search for a user by handle. Second param is whether the user was found or not. * Search for a user by handle. Second param is whether the user was found or not.
*/ */
@ -107,9 +91,6 @@ func (trove *TweetTrove) FetchTombstoneUsers() {
* *
* At this point, those users should have been added to this trove's Users collection, and the * At this point, those users should have been added to this trove's Users collection, and the
* Tweets have a field `UserHandle` which can be used to pair them with newly fetched Users. * Tweets have a field `UserHandle` which can be used to pair them with newly fetched Users.
*
* This will still fail if the user deleted their account (instead of getting banned, blocking the
* quote-tweeter, etc), because then that user won't show up.
*/ */
func (trove *TweetTrove) FillMissingUserIDs() { func (trove *TweetTrove) FillMissingUserIDs() {
for i := range trove.Tweets { for i := range trove.Tweets {
@ -133,3 +114,9 @@ func (trove *TweetTrove) FillMissingUserIDs() {
trove.Tweets[i] = tweet trove.Tweets[i] = tweet
} }
} }
/**
 * Run the standard clean-up steps on a freshly built trove: first FetchTombstoneUsers, then
 * FillMissingUserIDs.
 *
 * The fetch runs first; presumably FillMissingUserIDs relies on the fetched users already being
 * present in the trove -- confirm against those two methods.
 *
 * Currently always returns nil; the error return lets callers handle failures uniformly.
 */
func (trove *TweetTrove) PostProcess() error {
trove.FetchTombstoneUsers()
trove.FillMissingUserIDs()
// Neither step reports an error here, so there is nothing to propagate.
return nil
}

View File

@ -54,8 +54,6 @@ func GetUserFeedGraphqlFor(user_id UserID, min_tweets int) (trove TweetTrove, er
} }
fmt.Println("------------") fmt.Println("------------")
trove.FetchTombstoneUsers() err = trove.PostProcess()
trove.FillMissingUserIDs() return trove, err
return trove, nil
} }