Some housekeeping

This commit is contained in:
Alessio 2022-11-13 12:04:25 -05:00
parent b78cef34ce
commit d54e77b169
9 changed files with 51 additions and 40 deletions

View File

@ -61,7 +61,7 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
)
if err != nil {
return fmt.Errorf("Error executing SaveTweet(ID %d):\n %w", t.ID, err)
return fmt.Errorf("Error executing SaveTweet(ID %d). Info: %#v:\n %w", t.ID, t, err)
}
for _, url := range t.Urls {
err := p.SaveUrl(url)

View File

@ -18,6 +18,19 @@ func (p Profile) SaveTweetTrove(trove TweetTrove) {
}
fmt.Println(u.Handle, u.ID)
// If the User's ID was updated in saving (i.e., Unknown User), update it in the Trove too
// Also update tweets, retweets and spaces that reference this UserID
for j, tweet := range trove.Tweets {
if tweet.UserID == trove.Users[i].ID {
tweet.UserID = u.ID
trove.Tweets[j] = tweet
}
}
for j, retweet := range trove.Retweets {
if retweet.RetweetedByID == trove.Users[i].ID {
retweet.RetweetedByID = u.ID
trove.Retweets[j] = retweet
}
}
trove.Users[i] = u
// Download their tiny profile image
@ -28,7 +41,9 @@ func (p Profile) SaveTweetTrove(trove TweetTrove) {
}
// TODO: this is called earlier in the process as well, before parsing. Is that call redundant? Too tired to figure out right now
trove.FillMissingUserIDs()
// Update: Yes it's redundant. Places that return tweet troves should call `PostProcess`
// before returning, which includes `FillMissingUserIDs`.
// trove.FillMissingUserIDs()
for _, t := range trove.Tweets {
err := p.SaveTweet(t)

View File

@ -23,7 +23,9 @@ func (p Profile) SaveUser(u *scraper.User) error {
// We need to continue-- create a new fake user
u.ID = p.NextFakeUserID()
} else if err == nil {
// We're done; everything is fine (ID has already been scanned into the User)
// We're done; a user exists with this handle already. No need to fake anything, and we have no new data
// to provide (since the ID is fake).
// ID has already been scanned into the User, for use by the caller.
return nil
} else {
// A real error occurred

View File

@ -220,7 +220,7 @@ func (api API) GetUser(handle UserHandle) (APIUser, error) {
if err != nil {
return APIUser{}, fmt.Errorf("Error retrieving API response to GetUser(%s):\n %w", handle, err)
}
log.Debug(string(body))
log.Debug("GetUser(" + string(handle) + "): " + string(body))
err = json.Unmarshal(body, &response)
if err != nil {

View File

@ -2,6 +2,7 @@ package scraper
import (
"errors"
"fmt"
)
func TimestampToDateString(timestamp int) string {
@ -32,5 +33,14 @@ func Search(query string, min_results int) (trove TweetTrove, err error) {
}
}
return ParseTweetResponse(tweet_response)
trove, err = ParseTweetResponse(tweet_response)
if err != nil {
err = fmt.Errorf("Error parsing the tweet trove for search query %q:\n %w", query, err)
return
}
// Filling tombstones and tombstoned users is probably not necessary here, but we still
// need to fetch Spaces
err = trove.PostProcess()
return
}

View File

@ -256,10 +256,13 @@ func GetTweetFull(id TweetID) (trove TweetTrove, err error) {
panic(err)
}
trove.TombstoneUsers = tombstoned_users
trove.FetchTombstoneUsers()
// Quoted tombstones need their user_id filled out from the tombstoned_users list
trove.FillMissingUserIDs()
err = trove.PostProcess()
if err != nil {
err = fmt.Errorf("Error getting tweet (id %d):\n %w", id, err)
return
}
// Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at"
tweet, ok := trove.Tweets[id]
@ -270,8 +273,6 @@ func GetTweetFull(id TweetID) (trove TweetTrove, err error) {
tweet.IsConversationScraped = true
trove.Tweets[id] = tweet
// tweets, retweets, users = trove.Transform()
return
}

View File

@ -227,11 +227,10 @@ func TestParseTweetResponse(t *testing.T) {
trove, err := ParseTweetResponse(tweet_resp)
require.NoError(t, err)
tweets, retweets, users := trove.Transform()
assert.Len(tweets, 29-3)
assert.Len(retweets, 3)
assert.Len(users, 9)
assert.Len(trove.Tweets, 29-3)
assert.Len(trove.Retweets, 3)
assert.Len(trove.Users, 9)
}
func TestParseTweetResponseWithTombstones(t *testing.T) {
@ -249,9 +248,8 @@ func TestParseTweetResponseWithTombstones(t *testing.T) {
trove, err := ParseTweetResponse(tweet_resp)
require.NoError(t, err)
tweets, retweets, users := trove.Transform()
assert.Len(tweets, 2)
assert.Len(retweets, 0)
assert.Len(users, 1)
assert.Len(trove.Tweets, 2)
assert.Len(trove.Retweets, 0)
assert.Len(trove.Users, 1)
}

View File

@ -24,22 +24,6 @@ func NewTweetTrove() TweetTrove {
return ret
}
/**
 * Flatten the trove's Tweets, Retweets and Users collections into three plain slices, for
 * callers that still use the older slice-based interface.
 *
 * Each slice is built by ranging over the matching collection, so its element order is
 * whatever that iteration yields.  A slice is left nil if its collection is empty.
 *
 * TODO: refactor until this function isn't needed anymore
 */
func (trove TweetTrove) Transform() (tweets []Tweet, retweets []Retweet, users []User) {
	for _, tweet := range trove.Tweets {
		tweets = append(tweets, tweet)
	}
	for _, retweet := range trove.Retweets {
		retweets = append(retweets, retweet)
	}
	for _, user := range trove.Users {
		users = append(users, user)
	}
	return tweets, retweets, users
}
/**
* Search for a user by handle. Second param is whether the user was found or not.
*/
@ -107,9 +91,6 @@ func (trove *TweetTrove) FetchTombstoneUsers() {
*
* At this point, those users should have been added to this trove's Users collection, and the
* Tweets have a field `UserHandle` which can be used to pair them with newly fetched Users.
*
* This will still fail if the user deleted their account (instead of getting banned, blocking the
* quote-tweeter, etc), because then that user won't show up.
*/
func (trove *TweetTrove) FillMissingUserIDs() {
for i := range trove.Tweets {
@ -133,3 +114,9 @@ func (trove *TweetTrove) FillMissingUserIDs() {
trove.Tweets[i] = tweet
}
}
/**
 * Run the standard clean-up steps on a trove: `FetchTombstoneUsers` first, then
 * `FillMissingUserIDs`.
 *
 * The order is deliberate: `FillMissingUserIDs` is documented as expecting the tombstoned users
 * to already be present in this trove's Users collection.
 *
 * Functions that return a trove are meant to call this before returning.
 *
 * Always returns nil at present; neither step's result is checked here.
 */
func (trove *TweetTrove) PostProcess() error {
trove.FetchTombstoneUsers()
trove.FillMissingUserIDs()
return nil
}

View File

@ -54,8 +54,6 @@ func GetUserFeedGraphqlFor(user_id UserID, min_tweets int) (trove TweetTrove, er
}
fmt.Println("------------")
trove.FetchTombstoneUsers()
trove.FillMissingUserIDs()
return trove, nil
err = trove.PostProcess()
return trove, err
}