diff --git a/scraper/api_types_v2.go b/scraper/api_types_v2.go index 98f37e7..cbb0fe5 100644 --- a/scraper/api_types_v2.go +++ b/scraper/api_types_v2.go @@ -557,10 +557,48 @@ func (api_response APIV2Response) IsEmpty() bool { func (api_response APIV2Response) ToTweetTrove() (TweetTrove, error) { ret := NewTweetTrove() - // Parse all of the entries - for _, entry := range api_response.GetMainInstruction().Entries { // TODO: the second Instruction is the pinned tweet - main_trove := entry.ToTweetTrove() - ret.MergeWith(main_trove) + // Parse all of the entries, and attempt to do tombstone reply-joining as we go + for i, entry := range api_response.GetMainInstruction().Entries { // TODO: the second Instruction is the pinned tweet in a User Feed + ret.MergeWith(entry.ToTweetTrove()) + + // Only do tombstone reply-joining on a Tweet Detail thread (not a User Feed!) + if len(api_response.Data.ThreadedConversationWithInjectionsV2.Instructions) == 0 { + continue + } + // Skip the first entry since it doesn't have a parent + if i == 0 { + continue + } + // Infer "in_reply_to_id" for tombstoned tweets from the order of entries, if applicable + if entry.Content.EntryType != "TimelineTimelineItem" { + // Only check back up the parent thread, which will all be "TimelineTimelineItems". + // i.e., skip replies + // TODO: maybe don't skip replies? + continue + } + entry_type, main_tweet_id := entry.ParseID() + if entry_type == "cursor-showmorethreadsprompt" || entry_type == "cursor-bottom" { + // Skip cursors + // - "cursor-bottom" => load more high-quality replies + // - "cursor-showmorethreadsprompt" => "Show additional replies, including those that may contain offensive content" + continue + } + if entry_type != "tweet" { + // TODO: discovery panic + panic(fmt.Sprintf("Unexpected first part of entry id: %q", entry_type)) + } + main_tweet, is_ok := ret.Tweets[main_tweet_id] + if !is_ok { + // On a User Feed, the entry ID could also be a Retweet ID, but we should only reply-join on Tweet Detail views. + panic(fmt.Sprintf("Entry didn't parse correctly: %q", entry.EntryID)) + } + if !main_tweet.IsStub || main_tweet.InReplyToID != TweetID(0) { + // Not a tombstone; ignore + continue + } + _, prev_entry_id := api_response.GetMainInstruction().Entries[i-1].ParseID() + main_tweet.InReplyToID = prev_entry_id + ret.Tweets[main_tweet_id] = main_tweet } // Add in any tombstoned user handles and IDs if possible, by reading from the replies diff --git a/scraper/api_types_v2_test.go b/scraper/api_types_v2_test.go index 53f85a2..e1093a7 100644 --- a/scraper/api_types_v2_test.go +++ b/scraper/api_types_v2_test.go @@ -597,34 +597,35 @@ func TestAPIV2ConversationThreadWithTombstones(t *testing.T) { require.NoError(t, err) assert.Len(trove.Tweets, 4) - // t1, is_ok := trove.Tweets[1454515503242829830] - // assert.True(is_ok) - // assert.True(t1.IsStub) - // assert.Equal(TweetID(0), t1.InReplyToID) - // // TODO: assert associated user is fake + t1, is_ok := trove.Tweets[1454515503242829830] + assert.True(is_ok) + assert.True(t1.IsStub) + assert.Equal(TweetID(0), t1.InReplyToID) + // TODO: assert associated user is fake - // t2, is_ok := trove.Tweets[1454521424144654344] - // assert.True(is_ok) - // assert.True(t2.IsStub) - // assert.Equal(TweetID(1454515503242829830), t2.InReplyToID) + t2, is_ok := trove.Tweets[1454521424144654344] + assert.True(is_ok) + assert.True(t2.IsStub) + assert.Equal(TweetID(1454515503242829830), t2.InReplyToID) t3, is_ok := trove.Tweets[1454522147750260742] assert.True(is_ok) assert.True(t3.IsStub) - // assert.Equal(TweetID(1454521424144654344), t3.InReplyToID) + assert.Equal(TweetID(1454521424144654344), t3.InReplyToID) assert.Equal(UserID(1365863538393309184), t3.UserID) t3_user, is_ok := trove.Users[t3.UserID] assert.True(is_ok) + assert.False(t3_user.IsIdFake) assert.Equal(UserHandle("itsbackwereover"), t3_user.Handle) t4, is_ok := trove.Tweets[1454526270809726977] assert.True(is_ok) assert.False(t4.IsStub) assert.Equal(TweetID(1454522147750260742), t4.InReplyToID) - _, is_ok = trove.Users[t4.UserID] + t4_user, is_ok := trove.Users[t4.UserID] assert.True(is_ok) + assert.False(t4_user.IsIdFake) - // assert.Len(trove.Users, 4) assert.Len(trove.Retweets, 0) }