From 21581b325aafd1ef7d224b5711960804ed967338 Mon Sep 17 00:00:00 2001 From: Alessio Date: Thu, 8 Jun 2023 23:19:50 -0300 Subject: [PATCH] Add support for parsing Tweet Detail in APIv2, including its unusual cursor format and conversation threads --- scraper/api_types_v2.go | 53 ++++++++++++++----- scraper/api_types_v2_test.go | 22 ++++++++ ...on_thread_entry_with_show_more_button.json | 1 + 3 files changed, 62 insertions(+), 14 deletions(-) create mode 100644 scraper/test_responses/api_v2/conversation_thread_entry_with_show_more_button.json diff --git a/scraper/api_types_v2.go b/scraper/api_types_v2.go index 67e7795..876a373 100644 --- a/scraper/api_types_v2.go +++ b/scraper/api_types_v2.go @@ -334,6 +334,10 @@ func (api_v2_tweet APIV2Tweet) ToTweetTrove() TweetTrove { type ItemContent struct { ItemType string `json:"itemType"` TweetResults APIV2Result `json:"tweet_results"` + + // Cursors (conversation view format) + CursorType string `json:"cursorType"` + Value string `json:"value"` } // Wraps InnerAPIV2Entry to implement `json.Unmarshal`. Does the normal unmarshal but also saves the original JSON. @@ -355,7 +359,7 @@ type InnerAPIV2Entry struct { } } - // Cursors + // Cursors (user feed format) EntryType string `json:"entryType"` Value string `json:"value"` CursorType string `json:"cursorType"` @@ -378,22 +382,29 @@ func (e APIV2Entry) ToTweetTrove(ignore_null_entries bool) TweetTrove { panic(obj) } }() - if e.Content.EntryType == "TimelineTimelineCursor" { - // Ignore cursor entries + if e.Content.EntryType == "TimelineTimelineCursor" || e.Content.ItemContent.ItemType == "TimelineTimelineCursor" { + // Ignore cursor entries. + // - e.Content.EntryType -> User Feed itself + // - e.Content.ItemContent.ItemType -> conversation thread in a user feed return NewTweetTrove() } else if e.Content.EntryType == "TimelineTimelineModule" { ret := NewTweetTrove() switch strings.Split(e.EntryID, "-")[0] { - case "homeConversation": - // Process it + case "homeConversation", "conversationthread": + // Process it. + // - "homeConversation": conversation thread on a user feed + // - "conversationthread": conversation thread in the replies under a TweetDetail view for _, item := range e.Content.Items { + if item.Item.ItemContent.ItemType == "TimelineTimelineCursor" { + // "Show More" replies button in a thread on Tweet Detail page + continue + } ret.MergeWith(item.Item.ItemContent.TweetResults.ToTweetTrove(ignore_null_entries)) } - case "whoToFollow": - case "TopicsModule": - // Ignore "Who to follow" and "Topics" modules. + case "whoToFollow", "TopicsModule", "tweetdetailrelatedtweets": + // Ignore "Who to follow", "Topics" and "Related Tweets" modules. // TODO: maybe we can capture these eventually log.Debug(fmt.Sprintf("Skipping %s entry", e.EntryID)) @@ -424,6 +435,9 @@ type APIV2Response struct { } `json:"timeline"` } `json:"result"` } `json:"user"` + ThreadedConversationWithInjectionsV2 struct { + Instructions []APIV2Instruction `json:"instructions"` + } `json:"threaded_conversation_with_injections_v2"` } `json:"data"` } @@ -434,17 +448,28 @@ func (api_response APIV2Response) GetMainInstruction() *APIV2Instruction { return &instructions[i] } } + instructions = api_response.Data.ThreadedConversationWithInjectionsV2.Instructions + for i := range instructions { + if instructions[i].Type == "TimelineAddEntries" { + return &instructions[i] + } + } panic("No 'TimelineAddEntries' found") } func (api_response APIV2Response) GetCursorBottom() string { - entries := api_response.GetMainInstruction().Entries - last_entry := entries[len(entries)-1] - if last_entry.Content.CursorType != "Bottom" { - panic("No bottom cursor found") - } + for _, entry := range api_response.GetMainInstruction().Entries { + // For a user feed: + if entry.Content.CursorType == "Bottom" { + return entry.Content.Value + } - return last_entry.Content.Value + // For a Tweet Detail page: + if entry.Content.ItemContent.CursorType == "Bottom" { + return entry.Content.ItemContent.Value + } + } + return "" } /** diff --git a/scraper/api_types_v2_test.go b/scraper/api_types_v2_test.go index 71a7181..9254b48 100644 --- a/scraper/api_types_v2_test.go +++ b/scraper/api_types_v2_test.go @@ -680,3 +680,25 @@ func TestEntryWithConversationThread(t *testing.T) { _, is_ok = trove.Tweets[1624990170670850053] // Tweet 3 assert.True(is_ok) } + +// On a Tweet Detail page, there's a thread of replies, and then it says "Show more..." underneath +// to extend the conversation. This is different from the "Show more..." button to load more +// replies to the original tweet! +func TestConversationThreadEntryWithShowMoreButton(t *testing.T) { + assert := assert.New(t) + require := require.New(t) + data, err := os.ReadFile("test_responses/api_v2/conversation_thread_entry_with_show_more_button.json") + require.NoError(err) + var entry_result APIV2Entry + err = json.Unmarshal(data, &entry_result) + require.NoError(err) + + trove := entry_result.ToTweetTrove(true) + + assert.Len(trove.Tweets, 1) + t1, is_ok := trove.Tweets[1649803385485377536] + assert.True(is_ok) + assert.Equal(TweetID(1649600354747572225), t1.InReplyToID) + + assert.Len(trove.Users, 1) +} diff --git a/scraper/test_responses/api_v2/conversation_thread_entry_with_show_more_button.json b/scraper/test_responses/api_v2/conversation_thread_entry_with_show_more_button.json new file mode 100644 index 0000000..cfe5405 --- /dev/null +++ b/scraper/test_responses/api_v2/conversation_thread_entry_with_show_more_button.json @@ -0,0 +1 @@ +{"entryId":"conversationthread-1649803385485377536","sortIndex":"7573771682107203542","content":{"entryType":"TimelineTimelineModule","__typename":"TimelineTimelineModule","items":[{"entryId":"conversationthread-1649803385485377536-tweet-1649803385485377536","item":{"itemContent":{"itemType":"TimelineTweet","__typename":"TimelineTweet","tweet_results":{"result":{"__typename":"Tweet","rest_id":"1649803385485377536","has_birdwatch_notes":false,"core":{"user_results":{"result":{"__typename":"User","id":"VXNlcjoxNDAzMTAzMg==","rest_id":"14031032","affiliates_highlighted_label":{},"is_blue_verified":true,"profile_image_shape":"Circle","legacy":{"created_at":"Tue Feb 26 22:01:28 +0000 2008","default_profile":false,"default_profile_image":false,"description":"VP marketing @AdQuick, out of home advertising made simple & measurable. Prev work: Google, Invitae, Marketo, etc","entities":{"description":{"urls":[]},"url":{"urls":[{"display_url":"adamsinger.substack.com/welcome","expanded_url":"https://adamsinger.substack.com/welcome","url":"https://t.co/6kI9bzQ2eV","indices":[0,23]}]}},"fast_followers_count":0,"favourites_count":355788,"followers_count":81891,"friends_count":4200,"has_custom_timelines":true,"is_translator":false,"listed_count":3083,"location":"Austin, TX","media_count":31104,"name":"Adam Singer","normal_followers_count":81891,"pinned_tweet_ids_str":[],"possibly_sensitive":false,"profile_banner_url":"https://pbs.twimg.com/profile_banners/14031032/1663584125","profile_image_url_https":"https://pbs.twimg.com/profile_images/1526507327574220804/vDv7S4U7_normal.jpg","profile_interstitial_type":"","screen_name":"AdamSinger","statuses_count":313313,"translator_type":"none","url":"https://t.co/6kI9bzQ2eV","verified":false,"withheld_in_countries":[]}}}},"edit_control":{"edit_tweet_ids":["1649803385485377536"],"editable_until_msecs":"1682180553000","is_edit_eligible":false,"edits_remaining":"5"},"is_translatable":false,"views":{"count":"1755","state":"EnabledWithCount"},"source":"Twitter Web App","legacy":{"bookmark_count":0,"bookmarked":false,"created_at":"Sat Apr 22 15:52:33 +0000 2023","conversation_id_str":"1649600354747572225","display_text_range":[13,143],"entities":{"user_mentions":[{"id_str":"886358633646350340","name":"LindyMan","screen_name":"PaulSkallas","indices":[0,12]}],"urls":[],"hashtags":[],"symbols":[]},"favorite_count":16,"favorited":false,"full_text":"@PaulSkallas Clickabait = fast/cheap attention = forgotten just as quickly. What do you think happens to all the pop music of the same variety?","in_reply_to_screen_name":"PaulSkallas","in_reply_to_status_id_str":"1649600354747572225","in_reply_to_user_id_str":"886358633646350340","is_quote_status":false,"lang":"en","quote_count":0,"reply_count":1,"retweet_count":0,"retweeted":false,"user_id_str":"14031032","id_str":"1649803385485377536"},"quick_promote_eligibility":{"eligibility":"IneligibleUserUnauthorized"}}},"tweetDisplayType":"Tweet"},"clientEventInfo":{"details":{"conversationDetails":{"conversationSection":"HighQuality"},"timelinesDetails":{"controllerData":"DAACDAAEDAABCgABFSACDDADgAUKAAIAAAAAGADACAAAAAA="}}}}},{"entryId":"conversationthread-1649803385485377536-cursor-showmore-6525681801715054743","item":{"itemContent":{"itemType":"TimelineTimelineCursor","__typename":"TimelineTimelineCursor","value":"PAAAAPAtPBwcFoCAvtGE3KPlLRUCAAAYJmNvbnZlcnNhdGlvbnRocmVhZC0xNjQ5ODAzMzg1NDg1Mzc3NTM2IgAA","cursorType":"ShowMore","displayTreatment":{"actionText":"Show replies"}},"clientEventInfo":{"details":{"conversationDetails":{"conversationSection":"HighQuality"}}}}}],"displayType":"VerticalConversation","clientEventInfo":{"details":{"conversationDetails":{"conversationSection":"HighQuality"}}}}}