From 63ddaaeafba7a981f38d64c67ae27b5617a6fad4 Mon Sep 17 00:00:00 2001 From: Alessio Date: Sat, 14 Oct 2023 18:02:08 -0300 Subject: [PATCH] Scraper now handles implicit tombstones like '"quoted_status_result": {}' --- pkg/scraper/api_types_v2.go | 37 +++++++++++-------- pkg/scraper/api_types_v2_test.go | 23 +++++++++++- .../tweet_with_implicit_quoted_tombstone.json | 1 + pkg/scraper/tweet.go | 2 +- 4 files changed, 46 insertions(+), 17 deletions(-) create mode 100644 pkg/scraper/test_responses/api_v2/tweet_with_implicit_quoted_tombstone.json diff --git a/pkg/scraper/api_types_v2.go b/pkg/scraper/api_types_v2.go index 19294df..84a5a56 100644 --- a/pkg/scraper/api_types_v2.go +++ b/pkg/scraper/api_types_v2.go @@ -143,14 +143,15 @@ func (u APIV2UserResult) ToUser() User { return user } +type Tombstone struct { + Text struct { + Text string `json:"text"` + } `json:"text"` +} type _Result struct { - ID int64 `json:"rest_id,string"` - Legacy APIV2Tweet `json:"legacy"` - Tombstone *struct { - Text struct { - Text string `json:"text"` - } `json:"text"` - } `json:"tombstone"` + ID int64 `json:"rest_id,string"` + Legacy APIV2Tweet `json:"legacy"` + Tombstone *Tombstone `json:"tombstone"` Core *APIV2UserResult `json:"core"` Card APIV2Card `json:"card"` QuotedStatusResult *APIV2Result `json:"quoted_status_result"` @@ -213,6 +214,16 @@ func (api_result APIV2Result) ToTweetTrove() (TweetTrove, error) { quoted_api_result := api_result.Result.QuotedStatusResult quoted_trove, err := quoted_api_result.ToTweetTrove() + // Handle `"quoted_status_result": {}` results + if errors.Is(err, ERR_NO_TWEET) { + // Replace it with a tombstone + err = ErrorIsTombstone + if quoted_api_result.Result.Tombstone == nil { + quoted_api_result.Result.Tombstone = &Tombstone{} + } + quoted_api_result.Result.Tombstone.Text.Text = "This Post is unavailable. Learn more" + } + // Quoted tombstones can be handled here since we already have the ID and user handle if errors.Is(err, ErrorIsTombstone) { tombstoned_tweet := quoted_api_result.Result.Legacy.APITweet @@ -256,9 +267,9 @@ func (api_result APIV2Result) ToTweetTrove() (TweetTrove, error) { if api_result.Result.Legacy.RetweetedStatusResult == nil { // We have to filter out retweets. For some reason, retweets have a copy of the card in both the retweeting // and the retweeted TweetResults; it should only be parsed for the real Tweet, not the Retweet - main_tweet, is_ok := ret.Tweets[TweetID(api_result.Result.Legacy.ID)] + main_tweet, is_ok := ret.Tweets[TweetID(api_result.Result.ID)] if !is_ok { - panic(fmt.Errorf("Tweet trove didn't contain its own tweet with ID %d:\n %w", api_result.Result.Legacy.ID, EXTERNAL_API_ERROR)) + return TweetTrove{}, ERR_NO_TWEET } if api_result.Result.Card.Legacy.Name == "summary_large_image" || api_result.Result.Card.Legacy.Name == "player" { url := api_result.Result.Card.ParseAsUrl() @@ -595,9 +606,7 @@ func (api_response APIV2Response) GetCursorBottom() string { return "" } -/** - * Returns `true` if there's no non-cursor entries in this response, false otherwise - */ +// Returns `true` if there's no non-cursor entries in this response, false otherwise func (api_response APIV2Response) IsEmpty() bool { for _, e := range api_response.GetMainInstruction().Entries { if !strings.Contains(e.EntryID, "cursor") { @@ -607,9 +616,7 @@ func (api_response APIV2Response) IsEmpty() bool { return true } -/** - * Parse the collected API response and turn it into a TweetTrove - */ +// Parse the collected API response and turn it into a TweetTrove func (api_response APIV2Response) ToTweetTrove() (TweetTrove, error) { ret := NewTweetTrove() diff --git a/pkg/scraper/api_types_v2_test.go b/pkg/scraper/api_types_v2_test.go index 20fdfbe..aa5d1e8 100644 --- a/pkg/scraper/api_types_v2_test.go +++ b/pkg/scraper/api_types_v2_test.go @@ -595,7 +595,6 @@ func TestAPIV2UserFeedTombstoneEntry(t *testing.T) { require.NoError(t, err) trove := entry.ToTweetTrove() - assert.NoError(err) // assert.Len(trove.Tweets, 1) // assert.Len(trove.Users, 1) assert.Len(trove.Retweets, 0) @@ -917,3 +916,25 @@ func TestParseResultAsLikes(t *testing.T) { assert.True(is_ok, "Like (%#v) didn't have its Tweet in the trove", l) } } + +func TestTweetWithImplicitQuotedTombstone(t *testing.T) { + assert := assert.New(t) + require := require.New(t) + data, err := os.ReadFile("test_responses/api_v2/tweet_with_implicit_quoted_tombstone.json") + require.NoError(err) + var entry_result APIV2Entry + err = json.Unmarshal(data, &entry_result) + require.NoError(err) + + trove := entry_result.ToTweetTrove() + + assert.Len(trove.Tweets, 2) + + t1, is_ok := trove.Tweets[TweetID(1586033916367904768)] + assert.True(is_ok) + assert.False(t1.IsStub) + t2, is_ok := trove.Tweets[TweetID(1586033437806305280)] + assert.True(is_ok) + assert.True(t2.IsStub) + assert.Equal(t2.TombstoneType, "unavailable") +} diff --git a/pkg/scraper/test_responses/api_v2/tweet_with_implicit_quoted_tombstone.json b/pkg/scraper/test_responses/api_v2/tweet_with_implicit_quoted_tombstone.json new file mode 100644 index 0000000..fd3a389 --- /dev/null +++ b/pkg/scraper/test_responses/api_v2/tweet_with_implicit_quoted_tombstone.json @@ -0,0 +1 @@ +{"entryId":"tweet-1586033916367904768","sortIndex":"1713285565408870280","content":{"entryType":"TimelineTimelineItem","__typename":"TimelineTimelineItem","itemContent":{"itemType":"TimelineTweet","__typename":"TimelineTweet","tweet_results":{"result":{"__typename":"Tweet","rest_id":"1586033916367904768","core":{"user_results":{"result":{"__typename":"User","id":"VXNlcjo0NDA2NzI5OA==","rest_id":"44067298","affiliates_highlighted_label":{},"has_graduated_access":true,"is_blue_verified":true,"profile_image_shape":"Circle","legacy":{"blocked_by":false,"blocking":false,"follow_request_sent":false,"followed_by":false,"following":false,"muting":false,"notifications":false,"protected":false,"can_dm":false,"can_media_tag":false,"created_at":"Tue Jun 02 05:35:52 +0000 2009","default_profile":false,"default_profile_image":false,"description":"Author: Dear Reader, The New Right, The Anarchist Handbook & The White Pill \nHost: \"YOUR WELCOME\" \nSubject: Ego & Hubris by Harvey Pekar\nHe/Him ⚑","entities":{"description":{"urls":[]},"url":{"urls":[{"display_url":"amzn.to/3oInafv","expanded_url":"https://amzn.to/3oInafv","url":"https://t.co/7VDFOOtFK2","indices":[0,23]}]}},"fast_followers_count":0,"favourites_count":4831,"followers_count":658993,"friends_count":1040,"has_custom_timelines":true,"is_translator":false,"listed_count":2221,"location":"Austin","media_count":13701,"name":"Michael Malice","normal_followers_count":658993,"pinned_tweet_ids_str":["1712211529905353086"],"possibly_sensitive":false,"profile_banner_url":"https://pbs.twimg.com/profile_banners/44067298/1664774013","profile_image_url_https":"https://pbs.twimg.com/profile_images/1415820415314931715/_VVX4GI8_normal.jpg","profile_interstitial_type":"","screen_name":"michaelmalice","statuses_count":162731,"translator_type":"none","url":"https://t.co/7VDFOOtFK2","verified":false,"want_retweets":false,"withheld_in_countries":[]},"has_nft_avatar":false,"super_follow_eligible":false,"super_followed_by":false,"super_following":false}}},"edit_control":{"edit_tweet_ids":["1586033916367904768"],"editable_until_msecs":"1666976727000","is_edit_eligible":false,"edits_remaining":"5"},"is_translatable":false,"source":"Twitter Web App","quoted_status_result":{},"legacy":{"bookmark_count":3,"bookmarked":false,"created_at":"Fri Oct 28 16:35:27 +0000 2022","conversation_id_str":"1586033916367904768","display_text_range":[0,65],"entities":{"user_mentions":[],"urls":[],"hashtags":[],"symbols":[]},"favorite_count":436,"favorited":false,"full_text":"the Constitution does not grant rights, it claims to protect them","is_quote_status":true,"lang":"en","quote_count":1,"quoted_status_id_str":"1586033437806305280","quoted_status_permalink":{"url":"https://t.co/r6ZFruksos","expanded":"https://twitter.com/SloCobruh/status/1586033437806305280","display":"twitter.com/SloCobruh/stat…"},"reply_count":16,"retweet_count":27,"retweeted":false,"user_id_str":"44067298","id_str":"1586033916367904768"}}},"tweetDisplayType":"Tweet","highlights":{"textHighlights":[{"startIndex":4,"endIndex":16}]}},"feedbackInfo":{"feedbackKeys":["1060665035"]},"clientEventInfo":{"component":"result","element":"tweet","details":{"timelinesDetails":{"controllerData":"DAACDAAFDAABDAABDAABCgABAAAAAAAAAAAAAAwAAgoAAQAAAAAAAABACgAC+2hCBb/AmpQKAAVD6ekwp16RtQgABgAAAAAKAAdChvNiX2bzlgAAAAAA"}}}}} diff --git a/pkg/scraper/tweet.go b/pkg/scraper/tweet.go index 65020c4..e889859 100644 --- a/pkg/scraper/tweet.go +++ b/pkg/scraper/tweet.go @@ -131,7 +131,7 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) { ret.PostedAt, err = TimestampFromString(apiTweet.CreatedAt) if err != nil { if ret.ID == 0 { - return Tweet{}, fmt.Errorf("unable to parse tweet:\n %w", ERR_NO_TWEET) + return Tweet{}, fmt.Errorf("unable to parse tweet: %w", ERR_NO_TWEET) } return Tweet{}, fmt.Errorf("Error parsing time on tweet ID %d:\n %w", ret.ID, err) }