From d8b8aaea15a60dd7891da503bd6d419058904596 Mon Sep 17 00:00:00 2001 From: Alessio Date: Wed, 21 Jun 2023 15:38:18 -0300 Subject: [PATCH] Fix a bug where replies to deleted accounts can fail to do proper reply-joining --- scraper/api_types_v2.go | 7 +++-- scraper/api_types_v2_test.go | 28 +++++++++++++++++++ ...tail_with_unjoined_nontombstone_tweet.json | 1 + 3 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 scraper/test_responses/api_v2/tweet_detail_with_unjoined_nontombstone_tweet.json diff --git a/scraper/api_types_v2.go b/scraper/api_types_v2.go index 58871da..9366c0c 100644 --- a/scraper/api_types_v2.go +++ b/scraper/api_types_v2.go @@ -589,8 +589,9 @@ func (api_response APIV2Response) ToTweetTrove() (TweetTrove, error) { // Infer "in_reply_to_id" for tombstoned tweets from the order of entries, if applicable if entry.Content.EntryType == "TimelineTimelineItem" { entry_type, main_tweet_id := entry.ParseID() - if entry_type == "cursor-showmorethreadsprompt" || entry_type == "cursor-bottom" || entry_type == "cursor-showmorethreads" { + if entry_type == "cursor-showmorethreadsprompt" || entry_type == "cursor-bottom" || entry_type == "cursor-showmorethreads" || entry_type == "cursor-top" { // Skip cursors + // - "cursor-top" => So far, the only top-cursor type there is // - "cursor-bottom" => auto-loads more replies when you scroll it into view // - "cursor-showmorethreadsprompt" => "Show additional replies, including those that may contain offensive content" button // - "cursor-showmorethreads" => "Show more replies" button @@ -609,8 +610,8 @@ func (api_response APIV2Response) ToTweetTrove() (TweetTrove, error) { // On a User Feed, the entry ID could also be a Retweet ID, but we should only reply-join on Tweet Detail views. panic(fmt.Sprintf("Entry didn't parse correctly: %q", entry.EntryID)) } - if !main_tweet.IsStub || main_tweet.InReplyToID != TweetID(0) { - // Not a tombstone; ignore + if main_tweet.InReplyToID != TweetID(0) { + // Already has an InReplyToID, so doesn't need to be joined using positional inference continue } _, prev_entry_id := api_response.GetMainInstruction().Entries[i-1].ParseID() diff --git a/scraper/api_types_v2_test.go b/scraper/api_types_v2_test.go index 01ff63a..1f59cb5 100644 --- a/scraper/api_types_v2_test.go +++ b/scraper/api_types_v2_test.go @@ -816,3 +816,31 @@ func TestConversationThreadWithTombstoneReplies(t *testing.T) { assert.True(is_ok) assert.False(t1.IsStub) } + +func TestTweetDetailWithUnjoinedNontombstoneTweet(t *testing.T) { + assert := assert.New(t) + require := require.New(t) + data, err := os.ReadFile("test_responses/api_v2/tweet_detail_with_unjoined_nontombstone_tweet.json") + require.NoError(err) + var resp APIV2Response + err = json.Unmarshal(data, &resp) + require.NoError(err) + + trove, err := resp.ToTweetTrove() + require.NoError(err) + + assert.Len(trove.Tweets, 3) + t1, is_ok := trove.Tweets[1481999034328006662] + assert.True(is_ok) + assert.True(t1.IsStub) + + t2, is_ok := trove.Tweets[1481999536918831107] + assert.True(is_ok) + assert.True(t2.IsStub) + assert.Equal(t1.ID, t2.InReplyToID) + + t3, is_ok := trove.Tweets[1482000048447705090] // Main tweet + assert.True(is_ok) + assert.False(t3.IsStub) + assert.Equal(t2.ID, t3.InReplyToID) +} diff --git a/scraper/test_responses/api_v2/tweet_detail_with_unjoined_nontombstone_tweet.json b/scraper/test_responses/api_v2/tweet_detail_with_unjoined_nontombstone_tweet.json new file mode 100644 index 0000000..fabe046 --- /dev/null +++ b/scraper/test_responses/api_v2/tweet_detail_with_unjoined_nontombstone_tweet.json @@ -0,0 +1 @@ +{"data":{"threaded_conversation_with_injections_v2":{"instructions":[{"type":"TimelineAddEntries","entries":[{"entryId":"tweet-1481999034328006662","sortIndex":"7741373002526769145","content":{"entryType":"TimelineTimelineItem","__typename":"TimelineTimelineItem","itemContent":{"itemType":"TimelineTweet","__typename":"TimelineTweet","tweet_results":{"result":{"__typename":"TweetTombstone","tombstone":{"__typename":"TextTombstone","text":{"rtl":false,"text":"This Tweet was deleted by the Tweet author. Learn more","entities":[{"fromIndex":44,"toIndex":54,"ref":{"type":"TimelineUrl","url":"https://help.twitter.com/rules-and-policies/notices-on-twitter","urlType":"ExternalUrl"}}]}}}},"tweetDisplayType":"Tweet"}}},{"entryId":"tweet-1481999536918831107","sortIndex":"7741372499935944700","content":{"entryType":"TimelineTimelineItem","__typename":"TimelineTimelineItem","itemContent":{"itemType":"TimelineTweet","__typename":"TimelineTweet","tweet_results":{"result":{"__typename":"TweetTombstone","tombstone":{"__typename":"TextTombstone","text":{"rtl":false,"text":"This Tweet was deleted by the Tweet author. Learn more","entities":[{"fromIndex":44,"toIndex":54,"ref":{"type":"TimelineUrl","url":"https://help.twitter.com/rules-and-policies/notices-on-twitter","urlType":"ExternalUrl"}}]}}}},"tweetDisplayType":"Tweet"}}},{"entryId":"tweet-1482000048447705090","sortIndex":"7741371988407070717","content":{"entryType":"TimelineTimelineItem","__typename":"TimelineTimelineItem","itemContent":{"itemType":"TimelineTweet","__typename":"TimelineTweet","tweet_results":{"result":{"__typename":"Tweet","rest_id":"1482000048447705090","has_birdwatch_notes":false,"core":{"user_results":{"result":{"__typename":"User","id":"VXNlcjoxMjcwMDY4ODU4NzgzNzgwODY5","rest_id":"1270068858783780869","affiliates_highlighted_label":{},"is_blue_verified":false,"profile_image_shape":"Circle","legacy":{"created_at":"Mon Jun 08 19:03:19 +0000 2020","default_profile":true,"default_profile_image":false,"description":"old soul","entities":{"description":{"urls":[]}},"fast_followers_count":0,"favourites_count":5046,"followers_count":105,"friends_count":608,"has_custom_timelines":true,"is_translator":false,"listed_count":2,"location":"","media_count":126,"name":"mas","normal_followers_count":105,"pinned_tweet_ids_str":[],"possibly_sensitive":false,"profile_banner_url":"https://pbs.twimg.com/profile_banners/1270068858783780869/1611353637","profile_image_url_https":"https://pbs.twimg.com/profile_images/1352741312022405120/SBojrkhx_normal.jpg","profile_interstitial_type":"","screen_name":"masforshort","statuses_count":2820,"translator_type":"none","verified":false,"withheld_in_countries":[]}}}},"edit_control":{"edit_tweet_ids":["1482000048447705090"],"editable_until_msecs":"1642173120436","is_edit_eligible":true,"edits_remaining":"5"},"is_translatable":false,"views":{"state":"Enabled"},"source":"Twitter for iPhone","legacy":{"bookmark_count":0,"bookmarked":false,"created_at":"Fri Jan 14 14:42:00 +0000 2022","conversation_id_str":"1481797871389351936","display_text_range":[17,126],"entities":{"user_mentions":[],"urls":[],"hashtags":[],"symbols":[]},"favorite_count":4,"favorited":false,"full_text":"@HazardHarringto Mental health day aka googling what web3 means and seeing if a job there is sexier/more radical than big tech","is_quote_status":false,"lang":"en","quote_count":0,"reply_count":0,"retweet_count":0,"retweeted":false,"user_id_str":"1270068858783780869","id_str":"1482000048447705090"},"quick_promote_eligibility":{"eligibility":"IneligibleUserUnauthorized"}}},"tweetDisplayType":"Tweet"}}},{"entryId":"cursor-top-6320090580398366857","sortIndex":"7741553747501277184","content":{"entryType":"TimelineTimelineItem","__typename":"TimelineTimelineItem","itemContent":{"itemType":"TimelineTimelineCursor","__typename":"TimelineTimelineCursor","value":"DwAAAPAAHCaAgL3pwvi8kCk1AgAA","cursorType":"Top"}}}]}]}}}