From aa961b9ff40880b4545dea5cf6a0a450445d3074 Mon Sep 17 00:00:00 2001 From: Alessio Date: Tue, 1 Mar 2022 11:51:34 -0800 Subject: [PATCH] Fix handling of empty entries in the feed - e.g., retweets where the author then deleted the original tweet - I think this is a bug in the Twitter API (it doesn't make sense to return this data) but we have to handle it anyway --- scraper/api_types_v2.go | 14 ++++-- scraper/api_types_v2_test.go | 46 ++++++++++++------- .../api_v2/tombstone_tweet.json | 1 + 3 files changed, 40 insertions(+), 21 deletions(-) create mode 100644 scraper/test_responses/api_v2/tombstone_tweet.json diff --git a/scraper/api_types_v2.go b/scraper/api_types_v2.go index 2a82100..c4b603a 100644 --- a/scraper/api_types_v2.go +++ b/scraper/api_types_v2.go @@ -152,9 +152,15 @@ type APIV2Result struct { Tweet _Result `json:"tweet"` } `json:"result"` } -func (api_result APIV2Result) ToTweetTrove() TweetTrove { +func (api_result APIV2Result) ToTweetTrove(ignore_null_entries bool) TweetTrove { ret := NewTweetTrove() + // Start by checking if this is a null entry in a feed + if api_result.Result.Tombstone != nil && ignore_null_entries{ + // TODO: this is becoming really spaghetti. Why do we need a separate execution path for this? + return ret + } + if api_result.Result.Legacy.ID == 0 && api_result.Result.Tweet.Legacy.ID != 0 { // If the tweet has "__typename" of "TweetWithVisibilityResults", it uses a new structure with // a "tweet" field with the regular data, alongside a "tweetInterstitial" field which is ignored @@ -194,7 +200,7 @@ func (api_result APIV2Result) ToTweetTrove() TweetTrove { ret.TombstoneUsers = append(ret.TombstoneUsers, handle) } - quoted_trove := api_result.Result.QuotedStatusResult.ToTweetTrove() + quoted_trove := quoted_api_result.ToTweetTrove(false) ret.MergeWith(quoted_trove) } @@ -248,7 +254,7 @@ func (api_v2_tweet APIV2Tweet) ToTweetTrove() TweetTrove { // If there's a retweet, we ignore the main tweet except for posted_at and retweeting UserID if api_v2_tweet.RetweetedStatusResult != nil { - orig_tweet_trove := api_v2_tweet.RetweetedStatusResult.ToTweetTrove() + orig_tweet_trove := api_v2_tweet.RetweetedStatusResult.ToTweetTrove(false) ret.MergeWith(orig_tweet_trove) @@ -354,7 +360,7 @@ func (api_response APIV2Response) ToTweetTrove() (TweetTrove, error) { result := entry.Content.ItemContent.TweetResults - main_trove := result.ToTweetTrove() + main_trove := result.ToTweetTrove(true) ret.MergeWith(main_trove) } diff --git a/scraper/api_types_v2_test.go b/scraper/api_types_v2_test.go index fd9a288..ff3ba01 100644 --- a/scraper/api_types_v2_test.go +++ b/scraper/api_types_v2_test.go @@ -7,6 +7,7 @@ import ( "fmt" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" . "offline_twitter/scraper" ) @@ -61,7 +62,7 @@ func TestAPIV2ParseTweet(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove() + trove := tweet_result.ToTweetTrove(true) assert.Equal(1, len(trove.Tweets)) tweet, ok := trove.Tweets[1485708879174508550] @@ -108,7 +109,7 @@ func TestAPIV2ParseTweetWithQuotedTweet(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove() + trove := tweet_result.ToTweetTrove(true) // Should be 2 tweets: quote-tweet and quoted-tweet assert.Equal(2, len(trove.Tweets)) @@ -161,7 +162,7 @@ func TestAPIV2ParseRetweet(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove() + trove := tweet_result.ToTweetTrove(true) // Should only be 1 tweet, the retweeted one assert.Equal(1, len(trove.Tweets)) @@ -221,7 +222,7 @@ func TestAPIV2ParseRetweetedQuoteTweet(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove() + trove := tweet_result.ToTweetTrove(true) // Quoted tweet and quoting tweet assert.Equal(2, len(trove.Tweets)) @@ -281,7 +282,7 @@ func TestAPIV2ParseTweetWithQuotedTombstone(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove() + trove := tweet_result.ToTweetTrove(true) assert.Equal(1, len(trove.Users)) user, ok := trove.Users[44067298] @@ -317,7 +318,7 @@ func TestAPIV2ParseTweetWithURL(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove() + trove := tweet_result.ToTweetTrove(true) assert.Equal(1, len(trove.Tweets)) tweet, ok := trove.Tweets[1485695695025803264] @@ -352,7 +353,7 @@ func TestAPIV2ParseTweetWithURLPlayerCard(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove() + trove := tweet_result.ToTweetTrove(true) assert.Equal(1, len(trove.Tweets)) tweet, ok := trove.Tweets[1485504913614327808] @@ -384,7 +385,7 @@ func TestAPIV2ParseTweetWithURLRetweet(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove() + trove := tweet_result.ToTweetTrove(true) assert.Equal(1, len(trove.Tweets)) tweet, ok := trove.Tweets[1488605073588559873] @@ -411,7 +412,7 @@ func TestAPIV2ParseTweetWithPoll(t *testing.T) { err = json.Unmarshal(data, &tweet_result) assert.NoError(err) - trove := tweet_result.ToTweetTrove() + trove := tweet_result.ToTweetTrove(true) assert.Len(trove.Tweets, 1) tweet, ok := trove.Tweets[1485692111106285571] @@ -541,26 +542,37 @@ func TestAPIV2GetMainInstructionFromFeed(t *testing.T) { } /** - * Should handle an entry in the feed that's just a tombstone + * Should handle an entry in the feed that's a tombstone by just ignoring it + * Expectation: random tombstones in the feed with no context should parse as empty TweetTroves. + * + * The indication that it's from a feed (i.e., not in a comments thread) is 'ToTweetTrove(true)'. + * On a reply thread, it would be 'ToTweetTrove(false)'. */ func TestAPIV2TombstoneEntry(t *testing.T) { + assert := assert.New(t) + data, err := ioutil.ReadFile("test_responses/api_v2/tombstone_tweet.json") + require.NoError(t, err) + var tweet_result APIV2Result + err = json.Unmarshal(data, &tweet_result) + require.NoError(t, err) + + trove := tweet_result.ToTweetTrove(true) // 'true' indicates to ignore empty entries + assert.Len(trove.Tweets, 0) + assert.Len(trove.Users, 0) + assert.Len(trove.Retweets, 0) } func TestTweetWithWarning(t *testing.T) { assert := assert.New(t) data, err := ioutil.ReadFile("test_responses/api_v2/tweet_with_warning.json") - if err != nil { - panic(err) - } + require.NoError(t, err) var tweet_result APIV2Result err = json.Unmarshal(data, &tweet_result) - if err != nil { - t.Errorf(err.Error()) - } + require.NoError(t, err) - trove := tweet_result.ToTweetTrove() + trove := tweet_result.ToTweetTrove(true) assert.Len(trove.Retweets, 1) assert.Len(trove.Tweets, 2) diff --git a/scraper/test_responses/api_v2/tombstone_tweet.json b/scraper/test_responses/api_v2/tombstone_tweet.json new file mode 100644 index 0000000..1a9d3ba --- /dev/null +++ b/scraper/test_responses/api_v2/tombstone_tweet.json @@ -0,0 +1 @@ +{"result":{"__typename":"TweetTombstone","tombstone":{"__typename":"TextTombstone","text":{"rtl":false,"text":"This Tweet was deleted by the Tweet author. Learn more","entities":[{"fromIndex":44,"toIndex":54,"ref":{"type":"TimelineUrl","url":"https://help.twitter.com/rules-and-policies/notices-on-twitter","urlType":"ExternalUrl"}}]}}}}