diff --git a/scraper/api_types_v2.go b/scraper/api_types_v2.go index 876a373..e7f1591 100644 --- a/scraper/api_types_v2.go +++ b/scraper/api_types_v2.go @@ -2,6 +2,7 @@ package scraper import ( "encoding/json" + "errors" "fmt" "net/url" "strings" @@ -9,6 +10,8 @@ import ( log "github.com/sirupsen/logrus" ) +var ErrorIsTombstone = errors.New("tweet is a tombstone") + type CardValue struct { Type string `json:"type"` StringValue string `json:"string_value"` @@ -169,13 +172,14 @@ type APIV2Result struct { } `json:"result"` } -func (api_result APIV2Result) ToTweetTrove(ignore_null_entries bool) TweetTrove { +func (api_result APIV2Result) ToTweetTrove() (TweetTrove, error) { ret := NewTweetTrove() // Start by checking if this is a null entry in a feed - if api_result.Result.Tombstone != nil && ignore_null_entries { - // TODO: this is becoming really spaghetti. Why do we need a separate execution path for this? - return ret + if api_result.Result.Tombstone != nil { + // Returning an error indicates the parent (APIV2Entry) has to parse it as a tombstone. + // The tweet ID isn't available to the APIV2Result, but it is to the APIV2Entry. + return ret, ErrorIsTombstone } if api_result.Result.Legacy.ID == 0 && api_result.Result.Tweet.Legacy.ID != 0 { @@ -207,25 +211,42 @@ func (api_result APIV2Result) ToTweetTrove(ignore_null_entries bool) TweetTrove // Handle quoted tweet if api_result.Result.QuotedStatusResult != nil { quoted_api_result := api_result.Result.QuotedStatusResult + quoted_trove, err := quoted_api_result.ToTweetTrove() - // Quoted tweets might be tombstones! 
- if quoted_api_result.Result.Tombstone != nil { - tombstoned_tweet := &quoted_api_result.Result.Legacy.APITweet - var ok bool - tombstoned_tweet.TombstoneText, ok = tombstone_types[quoted_api_result.Result.Tombstone.Text.Text] - if !ok { + // Quoted tombstones can be handled here since we already have the ID and user handle + if errors.Is(err, ErrorIsTombstone) { + tombstoned_tweet := quoted_api_result.Result.Legacy.APITweet + + // Capture the tombstone text + var is_ok bool + tombstoned_tweet.TombstoneText, is_ok = tombstone_types[quoted_api_result.Result.Tombstone.Text.Text] + if !is_ok { panic(fmt.Errorf("Unknown tombstone text %q:\n %w", quoted_api_result.Result.Tombstone.Text.Text, EXTERNAL_API_ERROR)) } + + // Capture the tombstone ID tombstoned_tweet.ID = int64(int_or_panic(api_result.Result.Legacy.APITweet.QuotedStatusIDStr)) + + // Capture the tombstone's user handle handle, err := ParseHandleFromTweetUrl(api_result.Result.Legacy.APITweet.QuotedStatusPermalink.ExpandedURL) if err != nil { panic(err) } tombstoned_tweet.UserHandle = string(handle) + + // Parse the tombstone into a Tweet and add it to the trove + parsed_tombstone_tweet, err := ParseSingleTweet(tombstoned_tweet) + if err != nil { + panic(err) + } + ret.Tweets[parsed_tombstone_tweet.ID] = parsed_tombstone_tweet + + // Add the user as a tombstoned user to be fetched later ret.TombstoneUsers = append(ret.TombstoneUsers, handle) + } else if err != nil { + panic(err) } - quoted_trove := quoted_api_result.ToTweetTrove(false) ret.MergeWith(quoted_trove) } @@ -235,8 +256,8 @@ func (api_result APIV2Result) ToTweetTrove(ignore_null_entries bool) TweetTrove if api_result.Result.Legacy.RetweetedStatusResult == nil { // We have to filter out retweets. 
For some reason, retweets have a copy of the card in both the retweeting // and the retweeted TweetResults; it should only be parsed for the real Tweet, not the Retweet - main_tweet, ok := ret.Tweets[TweetID(api_result.Result.Legacy.ID)] - if !ok { + main_tweet, is_ok := ret.Tweets[TweetID(api_result.Result.Legacy.ID)] + if !is_ok { panic(fmt.Errorf("Tweet trove didn't contain its own tweet with ID %d:\n %w", api_result.Result.Legacy.ID, EXTERNAL_API_ERROR)) } if api_result.Result.Card.Legacy.Name == "summary_large_image" || api_result.Result.Card.Legacy.Name == "player" { @@ -284,7 +305,7 @@ func (api_result APIV2Result) ToTweetTrove(ignore_null_entries bool) TweetTrove } } - return ret + return ret, nil } type APIV2Tweet struct { @@ -299,11 +320,13 @@ func (api_v2_tweet APIV2Tweet) ToTweetTrove() TweetTrove { // If there's a retweet, we ignore the main tweet except for posted_at and retweeting UserID if api_v2_tweet.RetweetedStatusResult != nil { - orig_tweet_trove := api_v2_tweet.RetweetedStatusResult.ToTweetTrove(false) + orig_tweet_trove, err := api_v2_tweet.RetweetedStatusResult.ToTweetTrove() + if err != nil { + panic(err) + } ret.MergeWith(orig_tweet_trove) retweet := Retweet{} - var err error retweet.RetweetID = TweetID(api_v2_tweet.ID) if api_v2_tweet.RetweetedStatusResult.Result.Legacy.ID == 0 && api_v2_tweet.RetweetedStatusResult.Result.Tweet.Legacy.ID != 0 { @@ -375,7 +398,7 @@ func (e *APIV2Entry) UnmarshalJSON(data []byte) error { return nil } -func (e APIV2Entry) ToTweetTrove(ignore_null_entries bool) TweetTrove { +func (e APIV2Entry) ToTweetTrove() TweetTrove { defer func() { if obj := recover(); obj != nil { log.Warn(fmt.Sprintf("Panic while decoding entry: %s\n", e.OriginalJSON)) @@ -400,7 +423,11 @@ func (e APIV2Entry) ToTweetTrove(ignore_null_entries bool) TweetTrove { // "Show More" replies button in a thread on Tweet Detail page continue } - ret.MergeWith(item.Item.ItemContent.TweetResults.ToTweetTrove(ignore_null_entries)) + trove, err := 
item.Item.ItemContent.TweetResults.ToTweetTrove() + if err != nil { + panic(err) + } + ret.MergeWith(trove) } case "whoToFollow", "TopicsModule", "tweetdetailrelatedtweets": @@ -414,7 +441,35 @@ func (e APIV2Entry) ToTweetTrove(ignore_null_entries bool) TweetTrove { return ret } else if e.Content.EntryType == "TimelineTimelineItem" { - return e.Content.ItemContent.TweetResults.ToTweetTrove(ignore_null_entries) + ret, err := e.Content.ItemContent.TweetResults.ToTweetTrove() + + if errors.Is(err, ErrorIsTombstone) { + // Handle tombstones + ret = NewTweetTrove() // clear the result just in case + tombstoned_tweet := e.Content.ItemContent.TweetResults.Result.Legacy.APITweet // Will be empty to start + + // Capture the tombstone text + var is_ok bool + tombstoned_tweet.TombstoneText, is_ok = tombstone_types[e.Content.ItemContent.TweetResults.Result.Tombstone.Text.Text] + if !is_ok { + panic(fmt.Errorf( + "Unknown tombstone text %q:\n %w", + e.Content.ItemContent.TweetResults.Result.Tombstone.Text.Text, + EXTERNAL_API_ERROR, + )) + } + + // Capture the tombstone ID + tombstoned_tweet.ID = int64(int_or_panic(strings.Split(e.EntryID, "-")[1])) + + // Parse the tombstone into a Tweet and add it to the trove + parsed_tombstone_tweet, err := ParseSingleTweet(tombstoned_tweet) + if err != nil { + panic(err) + } + ret.Tweets[parsed_tombstone_tweet.ID] = parsed_tombstone_tweet + } + return ret } panic("Unknown EntryType: " + e.Content.EntryType) } @@ -489,11 +544,58 @@ func (api_response APIV2Response) IsEmpty() bool { */ func (api_response APIV2Response) ToTweetTrove() (TweetTrove, error) { ret := NewTweetTrove() + + // Parse all of the entries for _, entry := range api_response.GetMainInstruction().Entries { // TODO: the second Instruction is the pinned tweet - main_trove := entry.ToTweetTrove(true) + main_trove := entry.ToTweetTrove() ret.MergeWith(main_trove) } + // Add in any tombstoned user handles and IDs if possible, by reading from the replies + for _, tweet := range 
ret.Tweets { + // Skip if it's not a reply (nothing to add) + if tweet.InReplyToID == 0 { + continue + } + + // Skip if the replied tweet isn't in the result set (e.g., the reply is a quoted tweet) + replied_tweet, is_ok := ret.Tweets[tweet.InReplyToID] + if !is_ok { + continue + } + + // Skip if the replied tweet isn't a stub (it's already filled out) + if !replied_tweet.IsStub { + continue + } + + if replied_tweet.ID == 0 { + // Not sure if this can happen. Use a panic to detect if it does so we can analyse + // TODO: make a better system to capture "discovery panics" that doesn't involve panicking + panic(fmt.Sprintf("Tombstoned tweet has no ID (should be %d)", tweet.InReplyToID)) + } + + if replied_tweet.UserID == 0 { + replied_tweet.UserID = tweet.in_reply_to_user_id + if replied_tweet.UserID == 0 { // Still?? + log.Warn(fmt.Sprintf("Still couldn't find user for replied tweet %d", tweet.InReplyToID)) + continue + } + } // replied_tweet.UserID should now be a real UserID + + existing_user, is_ok := ret.Users[replied_tweet.UserID] + if !is_ok { + existing_user = User{ID: replied_tweet.UserID} + } + if existing_user.Handle == "" { + existing_user.Handle = tweet.in_reply_to_user_handle + } + ret.Users[replied_tweet.UserID] = existing_user + // TODO: add to ret.TombstonedUsers? 
+ + ret.Tweets[replied_tweet.ID] = replied_tweet + } + return ret, nil // TODO: This doesn't need to return an error, it's always nil } diff --git a/scraper/api_types_v2_test.go b/scraper/api_types_v2_test.go index 9254b48..414b6bb 100644 --- a/scraper/api_types_v2_test.go +++ b/scraper/api_types_v2_test.go @@ -63,7 +63,8 @@ func TestAPIV2ParseTweet(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove(true) + trove, err := tweet_result.ToTweetTrove() + assert.NoError(err) assert.Equal(1, len(trove.Tweets)) tweet, ok := trove.Tweets[1485708879174508550] @@ -111,7 +112,8 @@ func TestAPIV2ParseTweetWithQuotedTweet(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove(true) + trove, err := tweet_result.ToTweetTrove() + assert.NoError(err) // Should be 2 tweets: quote-tweet and quoted-tweet assert.Equal(2, len(trove.Tweets)) @@ -165,7 +167,8 @@ func TestAPIV2ParseRetweet(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove(true) + trove, err := tweet_result.ToTweetTrove() + assert.NoError(err) // Should only be 1 tweet, the retweeted one assert.Equal(1, len(trove.Tweets)) @@ -224,7 +227,8 @@ func TestAPIV2ParseRetweetedQuoteTweet(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove(true) + trove, err := tweet_result.ToTweetTrove() + assert.NoError(err) // Quoted tweet and quoting tweet assert.Equal(2, len(trove.Tweets)) @@ -283,7 +287,8 @@ func TestAPIV2ParseTweetWithQuotedTombstone(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove(true) + trove, err := tweet_result.ToTweetTrove() + assert.NoError(err) assert.Equal(1, len(trove.Users)) user, ok := trove.Users[44067298] @@ -318,7 +323,8 @@ func TestAPIV2ParseTweetWithURL(t *testing.T) { err = json.Unmarshal(data, 
&tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove(true) + trove, err := tweet_result.ToTweetTrove() + assert.NoError(err) assert.Equal(1, len(trove.Tweets)) tweet, ok := trove.Tweets[1485695695025803264] @@ -355,7 +361,8 @@ func TestAPIV2ParseTweetWithURLPlayerCard(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove(true) + trove, err := tweet_result.ToTweetTrove() + assert.NoError(err) assert.Equal(1, len(trove.Tweets)) tweet, ok := trove.Tweets[1485504913614327808] @@ -387,7 +394,8 @@ func TestAPIV2ParseTweetWithURLRetweet(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove(true) + trove, err := tweet_result.ToTweetTrove() + assert.NoError(err) assert.Equal(1, len(trove.Tweets)) tweet, ok := trove.Tweets[1488605073588559873] @@ -414,7 +422,8 @@ func TestAPIV2ParseTweetWithPoll(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove(true) + trove, err := tweet_result.ToTweetTrove() + assert.NoError(err) assert.Len(trove.Tweets, 1) tweet, ok := trove.Tweets[1485692111106285571] @@ -454,7 +463,8 @@ func TestAPIV2ParseTweetWithSpace(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove(true) + trove, err := tweet_result.ToTweetTrove() + assert.NoError(err) assert.Len(trove.Tweets, 1) tweet, ok := trove.Tweets[1497647006445146113] @@ -580,26 +590,68 @@ func TestAPIV2GetMainInstructionFromFeed(t *testing.T) { assert.Equal(feed.GetMainInstruction().Entries[41].EntryID, "asdf") } -/** - * Should handle an entry in the feed that's a tombstone by just ignoring it - * Expectation: random tombstones in the feed with no context should parse as empty TweetTroves. - * - * The indication that it's from a feed (i.e., not in a comments thread) is 'ToTweetTrove(true)'. 
- * On a reply thread, it would be 'ToTweetTrove(false)'. - */ func TestAPIV2TombstoneEntry(t *testing.T) { assert := assert.New(t) data, err := os.ReadFile("test_responses/api_v2/tombstone_tweet.json") require.NoError(t, err) - var tweet_result APIV2Result - err = json.Unmarshal(data, &tweet_result) + var entry APIV2Entry + err = json.Unmarshal(data, &entry) require.NoError(t, err) - trove := tweet_result.ToTweetTrove(true) // 'true' indicates to ignore empty entries - assert.Len(trove.Tweets, 0) + trove := entry.ToTweetTrove() + assert.NoError(err) + assert.Len(trove.Tweets, 1) assert.Len(trove.Users, 0) assert.Len(trove.Retweets, 0) + + tweet, is_ok := trove.Tweets[1454515503242829830] + assert.True(is_ok) + assert.Equal(tweet.ID, TweetID(1454515503242829830)) +} + +func TestAPIV2ConversationThreadWithTombstones(t *testing.T) { + assert := assert.New(t) + data, err := os.ReadFile("test_responses/api_v2/conversation_thread_with_tombstones.json") + require.NoError(t, err) + + var resp APIV2Response + err = json.Unmarshal(data, &resp) + require.NoError(t, err) + + trove, err := resp.ToTweetTrove() + require.NoError(t, err) + + assert.Len(trove.Tweets, 4) + // t1, is_ok := trove.Tweets[1454515503242829830] + // assert.True(is_ok) + // assert.True(t1.IsStub) + // assert.Equal(TweetID(0), t1.InReplyToID) + // // TODO: assert associated user is fake + + // t2, is_ok := trove.Tweets[1454521424144654344] + // assert.True(is_ok) + // assert.True(t2.IsStub) + // assert.Equal(TweetID(1454515503242829830), t2.InReplyToID) + + t3, is_ok := trove.Tweets[1454522147750260742] + assert.True(is_ok) + assert.True(t3.IsStub) + // assert.Equal(TweetID(1454521424144654344), t3.InReplyToID) + assert.Equal(UserID(1365863538393309184), t3.UserID) + t3_user, is_ok := trove.Users[t3.UserID] + assert.True(is_ok) + assert.Equal(UserHandle("itsbackwereover"), t3_user.Handle) + + t4, is_ok := trove.Tweets[1454526270809726977] + assert.True(is_ok) + assert.False(t4.IsStub) + 
assert.Equal(TweetID(1454522147750260742), t4.InReplyToID) + _, is_ok = trove.Users[t4.UserID] + assert.True(is_ok) + + // assert.Len(trove.Users, 4) + assert.Len(trove.Retweets, 0) } func TestTweetWithWarning(t *testing.T) { @@ -610,7 +662,8 @@ func TestTweetWithWarning(t *testing.T) { err = json.Unmarshal(data, &tweet_result) require.NoError(t, err) - trove := tweet_result.ToTweetTrove(true) + trove, err := tweet_result.ToTweetTrove() + assert.NoError(err) assert.Len(trove.Retweets, 1) assert.Len(trove.Tweets, 2) @@ -626,7 +679,8 @@ func TestRetweetWithVisibilityResults(t *testing.T) { err = json.Unmarshal(data, &tweet_result) require.NoError(err) - trove := tweet_result.ToTweetTrove(true) + trove, err := tweet_result.ToTweetTrove() + assert.NoError(err) assert.Len(trove.Retweets, 1) assert.Len(trove.Tweets, 1) @@ -646,7 +700,8 @@ func TestExpandableTweet(t *testing.T) { err = json.Unmarshal(data, &tweet_result) require.NoError(err) - trove := tweet_result.ToTweetTrove(true) + trove, err := tweet_result.ToTweetTrove() + assert.NoError(err) main_tweet, is_ok := trove.Tweets[TweetID(1649600354747572225)] require.True(is_ok) @@ -665,7 +720,7 @@ func TestEntryWithConversationThread(t *testing.T) { err = json.Unmarshal(data, &entry_result) require.NoError(err) - trove := entry_result.ToTweetTrove(true) + trove := entry_result.ToTweetTrove() assert.Len(trove.Tweets, 4) // 3 tweets in the thread plus the quoted tweet t1, is_ok := trove.Tweets[1624966566264680448] @@ -693,7 +748,7 @@ func TestConversationThreadEntryWithShowMoreButton(t *testing.T) { err = json.Unmarshal(data, &entry_result) require.NoError(err) - trove := entry_result.ToTweetTrove(true) + trove := entry_result.ToTweetTrove() assert.Len(trove.Tweets, 1) t1, is_ok := trove.Tweets[1649803385485377536] diff --git a/scraper/test_responses/api_v2/tombstone_tweet.json b/scraper/test_responses/api_v2/tombstone_tweet.json index 1a9d3ba..28bfb01 100644 --- a/scraper/test_responses/api_v2/tombstone_tweet.json +++ 
b/scraper/test_responses/api_v2/tombstone_tweet.json @@ -1 +1 @@ -{"result":{"__typename":"TweetTombstone","tombstone":{"__typename":"TextTombstone","text":{"rtl":false,"text":"This Tweet was deleted by the Tweet author. Learn more","entities":[{"fromIndex":44,"toIndex":54,"ref":{"type":"TimelineUrl","url":"https://help.twitter.com/rules-and-policies/notices-on-twitter","urlType":"ExternalUrl"}}]}}}} +{"entryId":"tweet-1454515503242829830","sortIndex":"7768856533611945977","content":{"entryType":"TimelineTimelineItem","__typename":"TimelineTimelineItem","itemContent":{"itemType":"TimelineTweet","__typename":"TimelineTweet","tweet_results":{"result":{"__typename":"TweetTombstone","tombstone":{"__typename":"TextTombstone","text":{"rtl":false,"text":"You’re unable to view this Tweet because this account owner limits who can view their Tweets. Learn more","entities":[{"fromIndex":94,"toIndex":104,"ref":{"type":"TimelineUrl","url":"https://help.twitter.com/rules-and-policies/notices-on-twitter","urlType":"ExternalUrl"}}]}}}},"tweetDisplayType":"Tweet","hasModeratedReplies":false}}} diff --git a/scraper/tweet.go b/scraper/tweet.go index 7548806..c21be96 100644 --- a/scraper/tweet.go +++ b/scraper/tweet.go @@ -33,9 +33,8 @@ func (l CommaSeparatedList) Value() (driver.Value, error) { } type Tweet struct { - ID TweetID `db:"id"` - UserID UserID `db:"user_id"` - UserHandle UserHandle // For processing tombstones + ID TweetID `db:"id"` + UserID UserID `db:"user_id"` User *User Text string `db:"text"` IsExpandable bool `db:"is_expandable"` @@ -47,6 +46,11 @@ type Tweet struct { InReplyToID TweetID `db:"in_reply_to_id"` QuotedTweetID TweetID `db:"quoted_tweet_id"` + // For processing tombstones + UserHandle UserHandle + in_reply_to_user_handle UserHandle + in_reply_to_user_id UserID + Images []Image Videos []Video Urls []Url @@ -221,6 +225,10 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) { ret.LastScrapedAt = TimestampFromUnix(0) // Caller will change this 
for the tweet that was actually scraped ret.IsConversationScraped = false // Safe due to the "No Worsening" principle + // Extra data that can help piece together tombstoned tweet info + ret.in_reply_to_user_id = UserID(apiTweet.InReplyToUserID) + ret.in_reply_to_user_handle = UserHandle(apiTweet.InReplyToScreenName) + return }