Refactor all scraper areas to use TweetTroves where applicable

This commit is contained in:
Alessio 2022-02-12 20:39:30 -08:00
parent c53d8ae24c
commit e0f05cbeea
7 changed files with 98 additions and 79 deletions

View File

@ -172,10 +172,11 @@ func fetch_tweet_conversation(tweet_identifier string) {
fmt.Println("Tweet is already in database. Updating...")
}
tweets, _, users, err := scraper.GetTweetFull(tweet_id)
trove, err := scraper.GetTweetFull(tweet_id)
if err != nil {
die(err.Error(), false, -1)
}
tweets, _, users := trove.Transform()
for _, u := range users {
fmt.Println(u.Handle)
@ -285,10 +286,11 @@ func download_user_content(handle scraper.UserHandle) {
func search(query string) {
tweets, retweets, users, err := scraper.Search(query, 1000);
trove, err := scraper.Search(query, 1000)
if err != nil {
die("Error scraping search results: " + err.Error(), false, -100)
}
tweets, retweets, users := trove.Transform()
for _, u := range users {
fmt.Println(u.Handle)

View File

@ -13,7 +13,7 @@ func TimestampToDateString(timestamp int) string {
* - photos
* - videos
*/
func Search(query string, min_results int) (tweets []Tweet, retweets []Retweet, users []User, err error) {
func Search(query string, min_results int) (trove TweetTrove, err error) {
api := API{}
tweet_response, err := api.Search(query, "")
if err != nil {

View File

@ -213,7 +213,7 @@ func GetTweet(id TweetID) (Tweet, error) {
*
* returns: the tweet, list of its replies and context, and users associated with those replies
*/
func GetTweetFull(id TweetID) (tweets []Tweet, retweets []Retweet, users []User, err error) {
func GetTweetFull(id TweetID) (trove TweetTrove, err error) {
api := API{}
tweet_response, err := api.GetTweet(id, "")
if err != nil {
@ -226,49 +226,31 @@ func GetTweetFull(id TweetID) (tweets []Tweet, retweets []Retweet, users []User,
return
}
}
tombstone_users := tweet_response.HandleTombstones()
fmt.Printf("%v\n", tombstone_users)
for _, u := range tombstone_users {
fetched_user, err1 := GetUser(u)
fetched_user.Handle = u
if err != nil {
err = err1
return
}
fmt.Println(fetched_user)
users = append(users, fetched_user)
// This has to be called BEFORE ParseTweetResponse, because it modifies the TweetResponse (adds tombstone tweets to its tweets list)
tombstoned_users := tweet_response.HandleTombstones()
trove, err = ParseTweetResponse(tweet_response)
if err != nil {
panic(err)
}
tweets, retweets, _users, err := ParseTweetResponse(tweet_response)
trove.TombstoneUsers = tombstoned_users
trove.FetchTombstoneUsers()
// Quoted tombstones need their user_id filled out from the tombstoned_users list
for i := range tweets {
if tweets[i].UserID != 0 {
continue
}
handle := tweet_response.GlobalObjects.Tweets[fmt.Sprint(tweets[i].ID)].UserHandle
is_found := false
for _, u := range users { // The tombstoned users, not from the tweet response
if u.Handle == UserHandle(handle) {
tweets[i].UserID = u.ID
is_found = true
break
}
}
if !is_found {
panic("Couldn't find the user handle in the list of tombstoned users!")
}
}
trove.FillMissingUserIDs()
// Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at"
scrape_time := time.Now()
for i, t := range(tweets) {
if t.ID == id {
// Index the slice because `tweets[i]` is a reference, whereas `t` is a copy
tweets[i].LastScrapedAt = scrape_time
tweets[i].IsConversationScraped = true
}
tweet, ok := trove.Tweets[id]
if !ok {
panic("Trove didn't contain its own tweet!")
}
users = append(users, _users...)
tweet.LastScrapedAt = time.Now()
tweet.IsConversationScraped = true
trove.Tweets[id] = tweet
// tweets, retweets, users = trove.Transform()
return
}
@ -280,32 +262,31 @@ func GetTweetFull(id TweetID) (tweets []Tweet, retweets []Retweet, users []User,
*
* returns: a list of tweets, retweets and users in that response object
*/
func ParseTweetResponse(resp TweetResponse) (tweets []Tweet, retweets []Retweet, users []User, err error) {
// TODO: TweetCollection maybe should be its own type
var new_tweet Tweet
var new_retweet Retweet
func ParseTweetResponse(resp TweetResponse) (TweetTrove, error) {
trove := NewTweetTrove()
for _, single_tweet := range resp.GlobalObjects.Tweets {
if single_tweet.RetweetedStatusIDStr == "" {
new_tweet, err = ParseSingleTweet(single_tweet)
new_tweet, err := ParseSingleTweet(single_tweet)
if err != nil {
return
return trove, err
}
tweets = append(tweets, new_tweet)
trove.Tweets[new_tweet.ID] = new_tweet
} else {
new_retweet, err = ParseSingleRetweet(single_tweet)
new_retweet, err := ParseSingleRetweet(single_tweet)
if err != nil {
return
return trove, err
}
retweets = append(retweets, new_retweet)
trove.Retweets[new_retweet.RetweetID] = new_retweet
}
}
var new_user User
for _, user := range resp.GlobalObjects.Users {
new_user, err = ParseSingleUser(user)
new_user, err := ParseSingleUser(user)
if err != nil {
return
return trove, err
}
users = append(users, new_user)
trove.Users[new_user.ID] = new_user
}
return
return trove, nil
}

View File

@ -161,8 +161,9 @@ func TestParseTweetResponse(t *testing.T) {
err = json.Unmarshal(data, &tweet_resp)
require.NoError(t, err)
tweets, retweets, users, err := ParseTweetResponse(tweet_resp)
trove, err := ParseTweetResponse(tweet_resp)
require.NoError(t, err)
tweets, retweets, users := trove.Transform()
assert.Len(tweets, 29 - 3)
assert.Len(retweets, 3)
@ -182,8 +183,9 @@ func TestParseTweetResponseWithTombstones(t *testing.T) {
extra_users := tweet_resp.HandleTombstones()
assert.Len(extra_users, 1)
tweets, retweets, users, err := ParseTweetResponse(tweet_resp)
trove, err := ParseTweetResponse(tweet_resp)
require.NoError(t, err)
tweets, retweets, users := trove.Transform()
assert.Len(tweets, 2)
assert.Len(retweets, 0)

View File

@ -37,6 +37,17 @@ func (trove TweetTrove) Transform() (tweets []Tweet, retweets []Retweet, users [
return
} // TODO: refactor until this function isn't needed anymore
/**
 * Look up a user in this trove by handle.
 *
 * The handle is compared with `==`, so the match is exact (case-sensitive).
 *
 * args:
 * - handle: the handle to look for among `trove.Users`
 *
 * returns: the matching User and `true`, or an empty User and `false` if no user has that handle
 */
func (trove TweetTrove) FindUserByHandle(handle UserHandle) (User, bool) {
	for id := range trove.Users {
		candidate := trove.Users[id]
		if candidate.Handle != handle {
			continue
		}
		return candidate, true
	}
	// Nothing matched
	return User{}, false
}
/**
* Combine two troves into one
@ -55,6 +66,30 @@ func (t1 *TweetTrove) MergeWith(t2 TweetTrove) {
t1.TombstoneUsers = append(t1.TombstoneUsers, t2.TombstoneUsers...)
}
/**
 * Fetch each user listed in `trove.TombstoneUsers` and store it in `trove.Users`, keyed by user ID.
 *
 * Handles that already belong to a user in the trove (per `FindUserByHandle`) are not fetched again.
 *
 * Panics if `GetUser` returns an error, or if the fetched user has an ID of 0.
 */
func (trove *TweetTrove) FetchTombstoneUsers() {
	for _, handle := range trove.TombstoneUsers {
		if _, is_known := trove.FindUserByHandle(handle); is_known {
			// Already in the trove; nothing to fetch
			continue
		}

		fetched, err := GetUser(handle)
		switch {
		case err != nil:
			panic(fmt.Sprintf("Error getting tombstoned user: %s\n %s", handle, err.Error()))
		case fetched.ID == 0:
			// A zero ID can't be used as a key in the Users map
			panic(fmt.Sprintf("UserID == 0 (@%s)", handle))
		}
		trove.Users[fetched.ID] = fetched
	}
}
/**
* Checks for tombstoned tweets and fills in their UserIDs based on the collected tombstoned users.

View File

@ -66,3 +66,21 @@ func TestFillMissingUserIDs(t *testing.T) {
assert.Equal(trove.Tweets[2].UserID, UserID(1))
}
/**
 * FindUserByHandle should return the matching user and `true` for a handle that is in the trove,
 * and `false` for a handle that isn't.
 */
func TestFindUserByHandle(t *testing.T) {
	assert := assert.New(t)

	// Build a trove containing two users with distinct handles
	trove := NewTweetTrove()
	for _, u := range []User{
		{ID: 1, Handle: "1", DisplayName: "One"},
		{ID: 2, Handle: "2", DisplayName: "Two"},
	} {
		trove.Users[u.ID] = u
	}

	// A handle that exists
	found, ok := trove.FindUserByHandle("2")
	assert.True(ok)
	assert.Equal(found.DisplayName, "Two")

	// A handle that doesn't exist
	_, ok = trove.FindUserByHandle("3")
	assert.False(ok)
}

View File

@ -14,7 +14,7 @@ import (
*
* returns: a slice of Tweets, Retweets, and Users
*/
func GetUserFeedFor(user_id UserID, min_tweets int) (tweets []Tweet, retweets []Retweet, users []User, err error) {
func GetUserFeedFor(user_id UserID, min_tweets int) (trove TweetTrove, err error) {
api := API{}
tweet_response, err := api.GetFeedFor(user_id, "")
if err != nil {
@ -47,33 +47,14 @@ func GetUserFeedGraphqlFor(user_id UserID, min_tweets int) (trove TweetTrove, er
}
}
trove, err = api_response.ToTweetTrove()
if err != nil {
panic(err)
}
// DUPE tombstone-user-processing
fmt.Println("------------")
for _, handle := range trove.TombstoneUsers {
fmt.Println(handle)
user, err := GetUser(handle)
if err != nil {
panic(err)
}
fmt.Println(user)
if user.ID == 0 {
panic(fmt.Sprintf("UserID == 0 (@%s)", handle))
}
trove.Users[user.ID] = user
}
// Quoted tombstones need their user_id filled out from the tombstoned_users list
trove.FetchTombstoneUsers()
trove.FillMissingUserIDs()
// <<<<<<< DUPE tombstone-user-processing
return trove, nil
}