Refactor all scraper areas to use TweetTroves where applicable
This commit is contained in:
parent
c53d8ae24c
commit
e0f05cbeea
@ -172,10 +172,11 @@ func fetch_tweet_conversation(tweet_identifier string) {
|
||||
fmt.Println("Tweet is already in database. Updating...")
|
||||
}
|
||||
|
||||
tweets, _, users, err := scraper.GetTweetFull(tweet_id)
|
||||
trove, err := scraper.GetTweetFull(tweet_id)
|
||||
if err != nil {
|
||||
die(err.Error(), false, -1)
|
||||
}
|
||||
tweets, _, users := trove.Transform()
|
||||
|
||||
for _, u := range users {
|
||||
fmt.Println(u.Handle)
|
||||
@ -285,10 +286,11 @@ func download_user_content(handle scraper.UserHandle) {
|
||||
|
||||
|
||||
func search(query string) {
|
||||
tweets, retweets, users, err := scraper.Search(query, 1000);
|
||||
trove, err := scraper.Search(query, 1000)
|
||||
if err != nil {
|
||||
die("Error scraping search results: " + err.Error(), false, -100)
|
||||
}
|
||||
tweets, retweets, users := trove.Transform()
|
||||
|
||||
for _, u := range users {
|
||||
fmt.Println(u.Handle)
|
||||
|
@ -13,7 +13,7 @@ func TimestampToDateString(timestamp int) string {
|
||||
* - photos
|
||||
* - videos
|
||||
*/
|
||||
func Search(query string, min_results int) (tweets []Tweet, retweets []Retweet, users []User, err error) {
|
||||
func Search(query string, min_results int) (trove TweetTrove, err error) {
|
||||
api := API{}
|
||||
tweet_response, err := api.Search(query, "")
|
||||
if err != nil {
|
||||
|
@ -213,7 +213,7 @@ func GetTweet(id TweetID) (Tweet, error) {
|
||||
*
|
||||
* returns: a TweetTrove holding the tweet, its replies and context, and the users associated with them
|
||||
*/
|
||||
func GetTweetFull(id TweetID) (tweets []Tweet, retweets []Retweet, users []User, err error) {
|
||||
func GetTweetFull(id TweetID) (trove TweetTrove, err error) {
|
||||
api := API{}
|
||||
tweet_response, err := api.GetTweet(id, "")
|
||||
if err != nil {
|
||||
@ -226,49 +226,31 @@ func GetTweetFull(id TweetID) (tweets []Tweet, retweets []Retweet, users []User,
|
||||
return
|
||||
}
|
||||
}
|
||||
tombstone_users := tweet_response.HandleTombstones()
|
||||
fmt.Printf("%v\n", tombstone_users)
|
||||
for _, u := range tombstone_users {
|
||||
fetched_user, err1 := GetUser(u)
|
||||
fetched_user.Handle = u
|
||||
if err != nil {
|
||||
err = err1
|
||||
return
|
||||
}
|
||||
fmt.Println(fetched_user)
|
||||
users = append(users, fetched_user)
|
||||
|
||||
// This has to be called BEFORE ParseTweetResponse, because it modifies the TweetResponse (adds tombstone tweets to its tweets list)
|
||||
tombstoned_users := tweet_response.HandleTombstones()
|
||||
|
||||
trove, err = ParseTweetResponse(tweet_response)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
tweets, retweets, _users, err := ParseTweetResponse(tweet_response)
|
||||
trove.TombstoneUsers = tombstoned_users
|
||||
trove.FetchTombstoneUsers()
|
||||
|
||||
// Quoted tombstones need their user_id filled out from the tombstoned_users list
|
||||
for i := range tweets {
|
||||
if tweets[i].UserID != 0 {
|
||||
continue
|
||||
}
|
||||
handle := tweet_response.GlobalObjects.Tweets[fmt.Sprint(tweets[i].ID)].UserHandle
|
||||
is_found := false
|
||||
for _, u := range users { // The tombstoned users, not from the tweet response
|
||||
if u.Handle == UserHandle(handle) {
|
||||
tweets[i].UserID = u.ID
|
||||
is_found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !is_found {
|
||||
panic("Couldn't find the user handle in the list of tombstoned users!")
|
||||
}
|
||||
}
|
||||
trove.FillMissingUserIDs()
|
||||
|
||||
// Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at"
|
||||
scrape_time := time.Now()
|
||||
for i, t := range(tweets) {
|
||||
if t.ID == id {
|
||||
// Index the slice because `tweets[i]` is a reference, whereas `t` is a copy
|
||||
tweets[i].LastScrapedAt = scrape_time
|
||||
tweets[i].IsConversationScraped = true
|
||||
}
|
||||
tweet, ok := trove.Tweets[id]
|
||||
if !ok {
|
||||
panic("Trove didn't contain its own tweet!")
|
||||
}
|
||||
users = append(users, _users...)
|
||||
tweet.LastScrapedAt = time.Now()
|
||||
tweet.IsConversationScraped = true
|
||||
trove.Tweets[id] = tweet
|
||||
|
||||
// tweets, retweets, users = trove.Transform()
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
@ -280,32 +262,31 @@ func GetTweetFull(id TweetID) (tweets []Tweet, retweets []Retweet, users []User,
|
||||
*
|
||||
* returns: a TweetTrove holding the tweets, retweets and users in that response object
|
||||
*/
|
||||
func ParseTweetResponse(resp TweetResponse) (tweets []Tweet, retweets []Retweet, users []User, err error) {
|
||||
// TODO: TweetCollection maybe should be its own type
|
||||
var new_tweet Tweet
|
||||
var new_retweet Retweet
|
||||
func ParseTweetResponse(resp TweetResponse) (TweetTrove, error) {
|
||||
trove := NewTweetTrove()
|
||||
|
||||
for _, single_tweet := range resp.GlobalObjects.Tweets {
|
||||
if single_tweet.RetweetedStatusIDStr == "" {
|
||||
new_tweet, err = ParseSingleTweet(single_tweet)
|
||||
new_tweet, err := ParseSingleTweet(single_tweet)
|
||||
if err != nil {
|
||||
return
|
||||
return trove, err
|
||||
}
|
||||
tweets = append(tweets, new_tweet)
|
||||
trove.Tweets[new_tweet.ID] = new_tweet
|
||||
} else {
|
||||
new_retweet, err = ParseSingleRetweet(single_tweet)
|
||||
new_retweet, err := ParseSingleRetweet(single_tweet)
|
||||
if err != nil {
|
||||
return
|
||||
return trove, err
|
||||
}
|
||||
retweets = append(retweets, new_retweet)
|
||||
trove.Retweets[new_retweet.RetweetID] = new_retweet
|
||||
}
|
||||
}
|
||||
var new_user User
|
||||
|
||||
for _, user := range resp.GlobalObjects.Users {
|
||||
new_user, err = ParseSingleUser(user)
|
||||
new_user, err := ParseSingleUser(user)
|
||||
if err != nil {
|
||||
return
|
||||
return trove, err
|
||||
}
|
||||
users = append(users, new_user)
|
||||
trove.Users[new_user.ID] = new_user
|
||||
}
|
||||
return
|
||||
return trove, nil
|
||||
}
|
||||
|
@ -161,8 +161,9 @@ func TestParseTweetResponse(t *testing.T) {
|
||||
err = json.Unmarshal(data, &tweet_resp)
|
||||
require.NoError(t, err)
|
||||
|
||||
tweets, retweets, users, err := ParseTweetResponse(tweet_resp)
|
||||
trove, err := ParseTweetResponse(tweet_resp)
|
||||
require.NoError(t, err)
|
||||
tweets, retweets, users := trove.Transform()
|
||||
|
||||
assert.Len(tweets, 29 - 3)
|
||||
assert.Len(retweets, 3)
|
||||
@ -182,8 +183,9 @@ func TestParseTweetResponseWithTombstones(t *testing.T) {
|
||||
extra_users := tweet_resp.HandleTombstones()
|
||||
assert.Len(extra_users, 1)
|
||||
|
||||
tweets, retweets, users, err := ParseTweetResponse(tweet_resp)
|
||||
trove, err := ParseTweetResponse(tweet_resp)
|
||||
require.NoError(t, err)
|
||||
tweets, retweets, users := trove.Transform()
|
||||
|
||||
assert.Len(tweets, 2)
|
||||
assert.Len(retweets, 0)
|
||||
|
@ -37,6 +37,17 @@ func (trove TweetTrove) Transform() (tweets []Tweet, retweets []Retweet, users [
|
||||
return
|
||||
} // TODO: refactor until this function isn't needed anymore
|
||||
|
||||
/**
|
||||
* Search for a user by handle. Second param is whether the user was found or not.
|
||||
*/
|
||||
func (trove TweetTrove) FindUserByHandle(handle UserHandle) (User, bool) {
|
||||
for _, user := range trove.Users {
|
||||
if user.Handle == handle {
|
||||
return user, true
|
||||
}
|
||||
}
|
||||
return User{}, false
|
||||
}
|
||||
|
||||
/**
|
||||
* Combine two troves into one
|
||||
@ -55,6 +66,30 @@ func (t1 *TweetTrove) MergeWith(t2 TweetTrove) {
|
||||
t1.TombstoneUsers = append(t1.TombstoneUsers, t2.TombstoneUsers...)
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to fetch every User that's been identified in a tombstone in this trove
|
||||
*/
|
||||
func (trove *TweetTrove) FetchTombstoneUsers() {
|
||||
for _, handle := range trove.TombstoneUsers {
|
||||
// Skip fetching if this user is already in the trove
|
||||
_, already_fetched := trove.FindUserByHandle(handle)
|
||||
if already_fetched {
|
||||
continue
|
||||
}
|
||||
|
||||
user, err := GetUser(handle)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Error getting tombstoned user: %s\n %s", handle, err.Error()))
|
||||
}
|
||||
|
||||
if user.ID == 0 {
|
||||
panic(fmt.Sprintf("UserID == 0 (@%s)", handle))
|
||||
}
|
||||
|
||||
trove.Users[user.ID] = user
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks for tombstoned tweets and fills in their UserIDs based on the collected tombstoned users.
|
||||
|
||||
|
@ -66,3 +66,21 @@ func TestFillMissingUserIDs(t *testing.T) {
|
||||
|
||||
assert.Equal(trove.Tweets[2].UserID, UserID(1))
|
||||
}
|
||||
|
||||
func TestFindUserByHandle(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
u1 := User{ID: 1, Handle: "1", DisplayName: "One"}
|
||||
u2 := User{ID: 2, Handle: "2", DisplayName: "Two"}
|
||||
|
||||
trove := NewTweetTrove()
|
||||
trove.Users[u1.ID] = u1
|
||||
trove.Users[u2.ID] = u2
|
||||
|
||||
user_2, ok := trove.FindUserByHandle("2")
|
||||
assert.True(ok)
|
||||
assert.Equal(user_2.DisplayName, "Two")
|
||||
|
||||
_, ok = trove.FindUserByHandle("3")
|
||||
assert.False(ok)
|
||||
}
|
||||
|
@ -14,7 +14,7 @@ import (
|
||||
*
|
||||
* returns: a TweetTrove holding the Tweets, Retweets, and Users
|
||||
*/
|
||||
func GetUserFeedFor(user_id UserID, min_tweets int) (tweets []Tweet, retweets []Retweet, users []User, err error) {
|
||||
func GetUserFeedFor(user_id UserID, min_tweets int) (trove TweetTrove, err error) {
|
||||
api := API{}
|
||||
tweet_response, err := api.GetFeedFor(user_id, "")
|
||||
if err != nil {
|
||||
@ -47,33 +47,14 @@ func GetUserFeedGraphqlFor(user_id UserID, min_tweets int) (trove TweetTrove, er
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
trove, err = api_response.ToTweetTrove()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
// DUPE tombstone-user-processing
|
||||
fmt.Println("------------")
|
||||
for _, handle := range trove.TombstoneUsers {
|
||||
fmt.Println(handle)
|
||||
|
||||
user, err := GetUser(handle)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
fmt.Println(user)
|
||||
|
||||
if user.ID == 0 {
|
||||
panic(fmt.Sprintf("UserID == 0 (@%s)", handle))
|
||||
}
|
||||
|
||||
trove.Users[user.ID] = user
|
||||
}
|
||||
// Quoted tombstones need their user_id filled out from the tombstoned_users list
|
||||
trove.FetchTombstoneUsers()
|
||||
trove.FillMissingUserIDs()
|
||||
|
||||
// <<<<<<< DUPE tombstone-user-processing
|
||||
|
||||
return trove, nil
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user