From d95f183e45dd73b35b5ea455fdf703f3f8a5210b Mon Sep 17 00:00:00 2001 From: Alessio Date: Fri, 7 Jan 2022 13:42:00 -0500 Subject: [PATCH] Add parsing of tombstoned quoted-tweets --- cmd/tests.sh | 2 +- scraper/api_types.go | 36 +++++++++++++++++++++++++++++++++--- scraper/tweet.go | 22 +++++++++++++++++++++- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/cmd/tests.sh b/cmd/tests.sh index c88bd14..da41599 100755 --- a/cmd/tests.sh +++ b/cmd/tests.sh @@ -102,7 +102,7 @@ test $(sqlite3 twitter.db "select handle from tweets join users on tweets.user_i test $(sqlite3 twitter.db "select is_conversation_scraped, abs(last_scraped_at - strftime('%s','now')) < 30 from tweets where id = 1429585423702052867") = "1|1" test $(sqlite3 twitter.db "select handle from tweets join users on tweets.user_id = users.id where tweets.id=1429584239570391042") = "michaelmalice" test $(sqlite3 twitter.db "select is_conversation_scraped from tweets where id = 1429584239570391042") = "0" -# test $(sqlite3 twitter.db "select handle from tweets join users on tweets.user_id = users.id where tweets.id=1429583672827465730") = "kanesays23" TODO: this guy got banned +test "$(sqlite3 twitter.db "select handle, is_banned from tweets join users on tweets.user_id = users.id where tweets.id=1429583672827465730")" = "kanesays23|1" # This guy got banned test $(sqlite3 twitter.db "select handle from tweets join users on tweets.user_id = users.id where tweets.id=1429616911315345414") = "NovaValentis" test $(sqlite3 twitter.db "select reply_mentions from tweets where id = 1429585423702052867") = "michaelmalice" test $(sqlite3 twitter.db "select reply_mentions from tweets where id = 1429616911315345414") = "RememberAfghan1,michaelmalice" diff --git a/scraper/api_types.go b/scraper/api_types.go index 90e0092..15eb25f 100644 --- a/scraper/api_types.go +++ b/scraper/api_types.go @@ -168,6 +168,7 @@ type APITweet struct { } `json:"quoted_status_permalink"` Time time.Time `json:"time"` UserID int64 `json:"user_id_str,string"` + UserHandle string Card APICard `json:"card"` TombstoneText string } @@ -326,9 +327,37 @@ var tombstone_types = map[string]string{ * Insert tweets into GlobalObjects for each tombstone. Returns a list of users that need to * be fetched for tombstones. */ -func (t *TweetResponse) HandleTombstones() []string { - ret := []string{} +func (t *TweetResponse) HandleTombstones() []UserHandle { + ret := []UserHandle{} + // Handle tombstones in quote-tweets + for _, api_tweet := range t.GlobalObjects.Tweets { + // Ignore if tweet doesn't have a quoted tweet + if api_tweet.QuotedStatusIDStr == "" { + continue + } + // Ignore if quoted tweet is in the Global Objects (i.e., not a tombstone) + if _, ok := t.GlobalObjects.Tweets[api_tweet.QuotedStatusIDStr]; ok { + continue + } + + user_handle, err := ParseHandleFromTweetUrl(api_tweet.QuotedStatusPermalink.ExpandedURL) + if err != nil { + panic(err) + } + + var tombstoned_tweet APITweet + tombstoned_tweet.ID = int64(int_or_panic(api_tweet.QuotedStatusIDStr)) + tombstoned_tweet.UserHandle = string(user_handle) + tombstoned_tweet.TombstoneText = "unavailable" + + ret = append(ret, user_handle) + fmt.Printf("Adding quoted tombstoned tweet: TweetID %d, handle %q\n", tombstoned_tweet.ID, tombstoned_tweet.UserHandle) + + t.GlobalObjects.Tweets[api_tweet.QuotedStatusIDStr] = tombstoned_tweet + } + + // Handle tombstones in the conversation flow entries := t.Timeline.Instructions[0].AddEntries.Entries sort.Sort(entries) for i, entry := range entries { @@ -344,7 +373,7 @@ func (t *TweetResponse) HandleTombstones() []string { } tombstoned_tweet.ID = api_tweet.InReplyToStatusID tombstoned_tweet.UserID = api_tweet.InReplyToUserID - ret = append(ret, api_tweet.InReplyToScreenName) + ret = append(ret, UserHandle(api_tweet.InReplyToScreenName)) } if i - 1 >= 0 && entries[i-1].Content.Item.Content.Tweet.ID != 0 { prev_tweet_id := entries[i-1].Content.Item.Content.Tweet.ID @@ -365,6 +394,7 @@ func (t *TweetResponse) HandleTombstones() []string { t.GlobalObjects.Tweets[fmt.Sprint(tombstoned_tweet.ID)] = tombstoned_tweet } } + return ret } diff --git a/scraper/tweet.go b/scraper/tweet.go index 1805552..b94ca67 100644 --- a/scraper/tweet.go +++ b/scraper/tweet.go @@ -226,7 +226,8 @@ func GetTweetFull(id TweetID) (tweets []Tweet, retweets []Retweet, users []User, tombstone_users := tweet_response.HandleTombstones() fmt.Printf("%v\n", tombstone_users) for _, u := range tombstone_users { - fetched_user, err1 := GetUser(UserHandle(u)) + fetched_user, err1 := GetUser(u) + fetched_user.Handle = u if err != nil { err = err1 return @@ -236,6 +237,25 @@ func GetTweetFull(id TweetID) (tweets []Tweet, retweets []Retweet, users []User, } tweets, retweets, _users, err := ParseTweetResponse(tweet_response) + // Quoted tombstones need their user_id filled out from the tombstoned_users list + for i, _ := range tweets { + if tweets[i].UserID != 0 { + continue + } + handle := tweet_response.GlobalObjects.Tweets[fmt.Sprint(tweets[i].ID)].UserHandle + is_found := false + for _, u := range users { // The tombstoned users, not from the tweet response + if u.Handle == UserHandle(handle) { + tweets[i].UserID = u.ID + is_found = true + break + } + } + if !is_found { + panic("Couldn't find the user handle in the list of tombstoned users!") + } + } + // Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at" scrape_time := time.Now() for i, t := range(tweets) {