Add parsing of tombstones in immediate parent replies in Tweet Detail views for APIv2

This commit is contained in:
Alessio 2023-06-17 15:09:20 -03:00
parent 21581b325a
commit 1226e3947e
4 changed files with 215 additions and 50 deletions

View File

@ -2,6 +2,7 @@ package scraper
import (
"encoding/json"
"errors"
"fmt"
"net/url"
"strings"
@ -9,6 +10,8 @@ import (
log "github.com/sirupsen/logrus"
)
// ErrorIsTombstone is the sentinel error returned by APIV2Result.ToTweetTrove when the result
// is a tombstone instead of a tweet.  Callers that have access to the tweet's ID (which the
// APIV2Result itself does not) check for it with errors.Is and build the tombstoned tweet
// themselves.
var ErrorIsTombstone = errors.New("tweet is a tombstone")
type CardValue struct {
Type string `json:"type"`
StringValue string `json:"string_value"`
@ -169,13 +172,14 @@ type APIV2Result struct {
} `json:"result"`
}
func (api_result APIV2Result) ToTweetTrove(ignore_null_entries bool) TweetTrove {
func (api_result APIV2Result) ToTweetTrove() (TweetTrove, error) {
ret := NewTweetTrove()
// Start by checking if this is a null entry in a feed
if api_result.Result.Tombstone != nil && ignore_null_entries {
// TODO: this is becoming really spaghetti. Why do we need a separate execution path for this?
return ret
if api_result.Result.Tombstone != nil {
// Returning an error indicates the parent (APIV2Entry) has to parse it as a tombstone.
// The tweet ID isn't available to the APIV2Result, but it is to the APIV2Entry.
return ret, ErrorIsTombstone
}
if api_result.Result.Legacy.ID == 0 && api_result.Result.Tweet.Legacy.ID != 0 {
@ -207,25 +211,42 @@ func (api_result APIV2Result) ToTweetTrove(ignore_null_entries bool) TweetTrove
// Handle quoted tweet
if api_result.Result.QuotedStatusResult != nil {
quoted_api_result := api_result.Result.QuotedStatusResult
quoted_trove, err := quoted_api_result.ToTweetTrove()
// Quoted tweets might be tombstones!
if quoted_api_result.Result.Tombstone != nil {
tombstoned_tweet := &quoted_api_result.Result.Legacy.APITweet
var ok bool
tombstoned_tweet.TombstoneText, ok = tombstone_types[quoted_api_result.Result.Tombstone.Text.Text]
if !ok {
// Quoted tombstones can be handled here since we already have the ID and user handle
if errors.Is(err, ErrorIsTombstone) {
tombstoned_tweet := quoted_api_result.Result.Legacy.APITweet
// Capture the tombstone text
var is_ok bool
tombstoned_tweet.TombstoneText, is_ok = tombstone_types[quoted_api_result.Result.Tombstone.Text.Text]
if !is_ok {
panic(fmt.Errorf("Unknown tombstone text %q:\n %w", quoted_api_result.Result.Tombstone.Text.Text, EXTERNAL_API_ERROR))
}
// Capture the tombstone ID
tombstoned_tweet.ID = int64(int_or_panic(api_result.Result.Legacy.APITweet.QuotedStatusIDStr))
// Capture the tombstone's user handle
handle, err := ParseHandleFromTweetUrl(api_result.Result.Legacy.APITweet.QuotedStatusPermalink.ExpandedURL)
if err != nil {
panic(err)
}
tombstoned_tweet.UserHandle = string(handle)
// Parse the tombstone into a Tweet and add it to the trove
parsed_tombstone_tweet, err := ParseSingleTweet(tombstoned_tweet)
if err != nil {
panic(err)
}
ret.Tweets[parsed_tombstone_tweet.ID] = parsed_tombstone_tweet
// Add the user as a tombstoned user to be fetched later
ret.TombstoneUsers = append(ret.TombstoneUsers, handle)
} else if err != nil {
panic(err)
}
quoted_trove := quoted_api_result.ToTweetTrove(false)
ret.MergeWith(quoted_trove)
}
@ -235,8 +256,8 @@ func (api_result APIV2Result) ToTweetTrove(ignore_null_entries bool) TweetTrove
if api_result.Result.Legacy.RetweetedStatusResult == nil {
// We have to filter out retweets. For some reason, retweets have a copy of the card in both the retweeting
// and the retweeted TweetResults; it should only be parsed for the real Tweet, not the Retweet
main_tweet, ok := ret.Tweets[TweetID(api_result.Result.Legacy.ID)]
if !ok {
main_tweet, is_ok := ret.Tweets[TweetID(api_result.Result.Legacy.ID)]
if !is_ok {
panic(fmt.Errorf("Tweet trove didn't contain its own tweet with ID %d:\n %w", api_result.Result.Legacy.ID, EXTERNAL_API_ERROR))
}
if api_result.Result.Card.Legacy.Name == "summary_large_image" || api_result.Result.Card.Legacy.Name == "player" {
@ -284,7 +305,7 @@ func (api_result APIV2Result) ToTweetTrove(ignore_null_entries bool) TweetTrove
}
}
return ret
return ret, nil
}
type APIV2Tweet struct {
@ -299,11 +320,13 @@ func (api_v2_tweet APIV2Tweet) ToTweetTrove() TweetTrove {
// If there's a retweet, we ignore the main tweet except for posted_at and retweeting UserID
if api_v2_tweet.RetweetedStatusResult != nil {
orig_tweet_trove := api_v2_tweet.RetweetedStatusResult.ToTweetTrove(false)
orig_tweet_trove, err := api_v2_tweet.RetweetedStatusResult.ToTweetTrove()
if err != nil {
panic(err)
}
ret.MergeWith(orig_tweet_trove)
retweet := Retweet{}
var err error
retweet.RetweetID = TweetID(api_v2_tweet.ID)
if api_v2_tweet.RetweetedStatusResult.Result.Legacy.ID == 0 && api_v2_tweet.RetweetedStatusResult.Result.Tweet.Legacy.ID != 0 {
@ -375,7 +398,7 @@ func (e *APIV2Entry) UnmarshalJSON(data []byte) error {
return nil
}
func (e APIV2Entry) ToTweetTrove(ignore_null_entries bool) TweetTrove {
func (e APIV2Entry) ToTweetTrove() TweetTrove {
defer func() {
if obj := recover(); obj != nil {
log.Warn(fmt.Sprintf("Panic while decoding entry: %s\n", e.OriginalJSON))
@ -400,7 +423,11 @@ func (e APIV2Entry) ToTweetTrove(ignore_null_entries bool) TweetTrove {
// "Show More" replies button in a thread on Tweet Detail page
continue
}
ret.MergeWith(item.Item.ItemContent.TweetResults.ToTweetTrove(ignore_null_entries))
trove, err := item.Item.ItemContent.TweetResults.ToTweetTrove()
if err != nil {
panic(err)
}
ret.MergeWith(trove)
}
case "whoToFollow", "TopicsModule", "tweetdetailrelatedtweets":
@ -414,7 +441,35 @@ func (e APIV2Entry) ToTweetTrove(ignore_null_entries bool) TweetTrove {
return ret
} else if e.Content.EntryType == "TimelineTimelineItem" {
return e.Content.ItemContent.TweetResults.ToTweetTrove(ignore_null_entries)
ret, err := e.Content.ItemContent.TweetResults.ToTweetTrove()
if errors.Is(err, ErrorIsTombstone) {
// Handle tombstones
ret = NewTweetTrove() // clear the result just in case
tombstoned_tweet := e.Content.ItemContent.TweetResults.Result.Legacy.APITweet // Will be empty to start
// Capture the tombstone text
var is_ok bool
tombstoned_tweet.TombstoneText, is_ok = tombstone_types[e.Content.ItemContent.TweetResults.Result.Tombstone.Text.Text]
if !is_ok {
panic(fmt.Errorf(
"Unknown tombstone text %q:\n %w",
e.Content.ItemContent.TweetResults.Result.Tombstone.Text.Text,
EXTERNAL_API_ERROR,
))
}
// Capture the tombstone ID
tombstoned_tweet.ID = int64(int_or_panic(strings.Split(e.EntryID, "-")[1]))
// Parse the tombstone into a Tweet and add it to the trove
parsed_tombstone_tweet, err := ParseSingleTweet(tombstoned_tweet)
if err != nil {
panic(err)
}
ret.Tweets[parsed_tombstone_tweet.ID] = parsed_tombstone_tweet
}
return ret
}
panic("Unknown EntryType: " + e.Content.EntryType)
}
@ -489,11 +544,58 @@ func (api_response APIV2Response) IsEmpty() bool {
*/
func (api_response APIV2Response) ToTweetTrove() (TweetTrove, error) {
// Works in two passes: first merge the trove produced by every entry of the main instruction,
// then back-fill user info on stub tweets using the tweets that reply to them.
// The returned error is currently always nil.
ret := NewTweetTrove()
// Parse all of the entries
// NOTE(review): two `main_trove :=` lines follow, one passing `true` and one passing no
// arguments; this looks like old/new diff residue -- confirm which call is the live one.
for _, entry := range api_response.GetMainInstruction().Entries { // TODO: the second Instruction is the pinned tweet
main_trove := entry.ToTweetTrove(true)
main_trove := entry.ToTweetTrove()
ret.MergeWith(main_trove)
}
// Add in any tombstoned user handles and IDs if possible, by reading from the replies
for _, tweet := range ret.Tweets {
// Skip if it's not a reply (nothing to add)
if tweet.InReplyToID == 0 {
continue
}
// Skip if the replied tweet isn't in the result set (e.g., the reply is a quoted tweet)
// `replied_tweet` is a copy of the map value; changes to it only take effect once it is
// stored back into `ret.Tweets` at the bottom of the loop.
replied_tweet, is_ok := ret.Tweets[tweet.InReplyToID]
if !is_ok {
continue
}
// Skip if the replied tweet isn't a stub (it's already filled out)
if !replied_tweet.IsStub {
continue
}
if replied_tweet.ID == 0 {
// Not sure if this can happen. Use a panic to detect if it does so we can analyse
// TODO: make a better system to capture "discovery panics" that doesn't involve panicking
panic(fmt.Sprintf("Tombstoned tweet has no ID (should be %d)", tweet.InReplyToID))
}
// If the stub has no author yet, take the author ID that the reply says it is replying to
if replied_tweet.UserID == 0 {
replied_tweet.UserID = tweet.in_reply_to_user_id
if replied_tweet.UserID == 0 { // Still??
log.Warn(fmt.Sprintf("Still couldn't find user for replied tweet %d", tweet.InReplyToID))
// Nothing is written back for this stub; move on to the next tweet
continue
}
} // replied_tweet.UserID should now be a real UserID
// Make sure the trove has a User for the stub's author, creating one with only the ID set
// if none exists, and fill in its handle from the reply if the handle is still empty
existing_user, is_ok := ret.Users[replied_tweet.UserID]
if !is_ok {
existing_user = User{ID: replied_tweet.UserID}
}
if existing_user.Handle == "" {
existing_user.Handle = tweet.in_reply_to_user_handle
}
ret.Users[replied_tweet.UserID] = existing_user
// TODO: add to ret.TombstonedUsers?
// Store the updated copy of the stub back into the trove
// (presumably replied_tweet.ID equals tweet.InReplyToID, the key it was looked up under -- TODO confirm)
ret.Tweets[replied_tweet.ID] = replied_tweet
}
return ret, nil // TODO: This doesn't need to return an error, it's always nil
}

View File

@ -63,7 +63,8 @@ func TestAPIV2ParseTweet(t *testing.T) {
err = json.Unmarshal(data, &tweet_result)
assert.NoError(err)
trove := tweet_result.ToTweetTrove(true)
trove, err := tweet_result.ToTweetTrove()
assert.NoError(err)
assert.Equal(1, len(trove.Tweets))
tweet, ok := trove.Tweets[1485708879174508550]
@ -111,7 +112,8 @@ func TestAPIV2ParseTweetWithQuotedTweet(t *testing.T) {
err = json.Unmarshal(data, &tweet_result)
assert.NoError(err)
trove := tweet_result.ToTweetTrove(true)
trove, err := tweet_result.ToTweetTrove()
assert.NoError(err)
// Should be 2 tweets: quote-tweet and quoted-tweet
assert.Equal(2, len(trove.Tweets))
@ -165,7 +167,8 @@ func TestAPIV2ParseRetweet(t *testing.T) {
err = json.Unmarshal(data, &tweet_result)
assert.NoError(err)
trove := tweet_result.ToTweetTrove(true)
trove, err := tweet_result.ToTweetTrove()
assert.NoError(err)
// Should only be 1 tweet, the retweeted one
assert.Equal(1, len(trove.Tweets))
@ -224,7 +227,8 @@ func TestAPIV2ParseRetweetedQuoteTweet(t *testing.T) {
err = json.Unmarshal(data, &tweet_result)
assert.NoError(err)
trove := tweet_result.ToTweetTrove(true)
trove, err := tweet_result.ToTweetTrove()
assert.NoError(err)
// Quoted tweet and quoting tweet
assert.Equal(2, len(trove.Tweets))
@ -283,7 +287,8 @@ func TestAPIV2ParseTweetWithQuotedTombstone(t *testing.T) {
err = json.Unmarshal(data, &tweet_result)
assert.NoError(err)
trove := tweet_result.ToTweetTrove(true)
trove, err := tweet_result.ToTweetTrove()
assert.NoError(err)
assert.Equal(1, len(trove.Users))
user, ok := trove.Users[44067298]
@ -318,7 +323,8 @@ func TestAPIV2ParseTweetWithURL(t *testing.T) {
err = json.Unmarshal(data, &tweet_result)
assert.NoError(err)
trove := tweet_result.ToTweetTrove(true)
trove, err := tweet_result.ToTweetTrove()
assert.NoError(err)
assert.Equal(1, len(trove.Tweets))
tweet, ok := trove.Tweets[1485695695025803264]
@ -355,7 +361,8 @@ func TestAPIV2ParseTweetWithURLPlayerCard(t *testing.T) {
err = json.Unmarshal(data, &tweet_result)
assert.NoError(err)
trove := tweet_result.ToTweetTrove(true)
trove, err := tweet_result.ToTweetTrove()
assert.NoError(err)
assert.Equal(1, len(trove.Tweets))
tweet, ok := trove.Tweets[1485504913614327808]
@ -387,7 +394,8 @@ func TestAPIV2ParseTweetWithURLRetweet(t *testing.T) {
err = json.Unmarshal(data, &tweet_result)
assert.NoError(err)
trove := tweet_result.ToTweetTrove(true)
trove, err := tweet_result.ToTweetTrove()
assert.NoError(err)
assert.Equal(1, len(trove.Tweets))
tweet, ok := trove.Tweets[1488605073588559873]
@ -414,7 +422,8 @@ func TestAPIV2ParseTweetWithPoll(t *testing.T) {
err = json.Unmarshal(data, &tweet_result)
assert.NoError(err)
trove := tweet_result.ToTweetTrove(true)
trove, err := tweet_result.ToTweetTrove()
assert.NoError(err)
assert.Len(trove.Tweets, 1)
tweet, ok := trove.Tweets[1485692111106285571]
@ -454,7 +463,8 @@ func TestAPIV2ParseTweetWithSpace(t *testing.T) {
err = json.Unmarshal(data, &tweet_result)
assert.NoError(err)
trove := tweet_result.ToTweetTrove(true)
trove, err := tweet_result.ToTweetTrove()
assert.NoError(err)
assert.Len(trove.Tweets, 1)
tweet, ok := trove.Tweets[1497647006445146113]
@ -580,26 +590,68 @@ func TestAPIV2GetMainInstructionFromFeed(t *testing.T) {
assert.Equal(feed.GetMainInstruction().Entries[41].EntryID, "asdf")
}
/**
* Should handle an entry in the feed that's a tombstone by just ignoring it
* Expectation: random tombstones in the feed with no context should parse as empty TweetTroves.
*
* The indication that it's from a feed (i.e., not in a comments thread) is 'ToTweetTrove(true)'.
* On a reply thread, it would be 'ToTweetTrove(false)'.
*/
func TestAPIV2TombstoneEntry(t *testing.T) {
// Reads the tombstone fixture, unmarshals it, converts it to a TweetTrove and checks the
// trove's contents.
// NOTE(review): this body contains two parse paths -- an APIV2Result converted with
// `ToTweetTrove(true)` expecting 0 tweets, and an APIV2Entry converted with `ToTweetTrove()`
// expecting 1 tweet.  This looks like old/new diff residue; confirm which path is live.
assert := assert.New(t)
data, err := os.ReadFile("test_responses/api_v2/tombstone_tweet.json")
require.NoError(t, err)
var tweet_result APIV2Result
err = json.Unmarshal(data, &tweet_result)
var entry APIV2Entry
err = json.Unmarshal(data, &entry)
require.NoError(t, err)
trove := tweet_result.ToTweetTrove(true) // 'true' indicates to ignore empty entries
assert.Len(trove.Tweets, 0)
trove := entry.ToTweetTrove()
// NOTE(review): `entry.ToTweetTrove()` returns no error, so `err` here is still the
// json.Unmarshal result that was already checked above.
assert.NoError(err)
assert.Len(trove.Tweets, 1)
assert.Len(trove.Users, 0)
assert.Len(trove.Retweets, 0)
// The expected ID is the numeric part of the fixture's entryId ("tweet-1454515503242829830")
tweet, is_ok := trove.Tweets[1454515503242829830]
assert.True(is_ok)
assert.Equal(tweet.ID, TweetID(1454515503242829830))
}
// TestAPIV2ConversationThreadWithTombstones parses a saved conversation-thread response that
// contains tombstones and checks which tweets and users end up in the resulting trove.
//
// Currently verified: the trove holds 4 tweets and no retweets; the third tweet in the chain
// is a stub with a known user ID whose handle is present in the trove; the fourth tweet is a
// real (non-stub) reply to the third, and its author is in the trove.
// Expectations for the first two tweets and the total user count are disabled (see below).
func TestAPIV2ConversationThreadWithTombstones(t *testing.T) {
	// IDs of the two tweets in the fixture that are currently asserted on
	const (
		stub_parent_id = 1454522147750260742
		reply_id       = 1454526270809726977
	)

	assert := assert.New(t)

	fixture, err := os.ReadFile("test_responses/api_v2/conversation_thread_with_tombstones.json")
	require.NoError(t, err)

	var api_response APIV2Response
	require.NoError(t, json.Unmarshal(fixture, &api_response))

	trove, err := api_response.ToTweetTrove()
	require.NoError(t, err)

	assert.Len(trove.Tweets, 4)

	// Disabled expectations for the first two tweets in the chain:
	//
	// t1, found := trove.Tweets[1454515503242829830]
	// assert.True(found)
	// assert.True(t1.IsStub)
	// assert.Equal(TweetID(0), t1.InReplyToID)
	// // TODO: assert associated user is fake
	//
	// t2, found := trove.Tweets[1454521424144654344]
	// assert.True(found)
	// assert.True(t2.IsStub)
	// assert.Equal(TweetID(1454515503242829830), t2.InReplyToID)

	// The stub tweet that the real reply is responding to
	stub_parent, found := trove.Tweets[stub_parent_id]
	assert.True(found)
	assert.True(stub_parent.IsStub)
	// assert.Equal(TweetID(1454521424144654344), stub_parent.InReplyToID)
	assert.Equal(UserID(1365863538393309184), stub_parent.UserID)

	// The stub's author should be in the trove, with a handle
	stub_parent_author, found := trove.Users[stub_parent.UserID]
	assert.True(found)
	assert.Equal(UserHandle("itsbackwereover"), stub_parent_author.Handle)

	// The real tweet at the end of the chain
	reply, found := trove.Tweets[reply_id]
	assert.True(found)
	assert.False(reply.IsStub)
	assert.Equal(TweetID(stub_parent_id), reply.InReplyToID)

	_, found = trove.Users[reply.UserID]
	assert.True(found)

	// assert.Len(trove.Users, 4)
	assert.Len(trove.Retweets, 0)
}
func TestTweetWithWarning(t *testing.T) {
@ -610,7 +662,8 @@ func TestTweetWithWarning(t *testing.T) {
err = json.Unmarshal(data, &tweet_result)
require.NoError(t, err)
trove := tweet_result.ToTweetTrove(true)
trove, err := tweet_result.ToTweetTrove()
assert.NoError(err)
assert.Len(trove.Retweets, 1)
assert.Len(trove.Tweets, 2)
@ -626,7 +679,8 @@ func TestRetweetWithVisibilityResults(t *testing.T) {
err = json.Unmarshal(data, &tweet_result)
require.NoError(err)
trove := tweet_result.ToTweetTrove(true)
trove, err := tweet_result.ToTweetTrove()
assert.NoError(err)
assert.Len(trove.Retweets, 1)
assert.Len(trove.Tweets, 1)
@ -646,7 +700,8 @@ func TestExpandableTweet(t *testing.T) {
err = json.Unmarshal(data, &tweet_result)
require.NoError(err)
trove := tweet_result.ToTweetTrove(true)
trove, err := tweet_result.ToTweetTrove()
assert.NoError(err)
main_tweet, is_ok := trove.Tweets[TweetID(1649600354747572225)]
require.True(is_ok)
@ -665,7 +720,7 @@ func TestEntryWithConversationThread(t *testing.T) {
err = json.Unmarshal(data, &entry_result)
require.NoError(err)
trove := entry_result.ToTweetTrove(true)
trove := entry_result.ToTweetTrove()
assert.Len(trove.Tweets, 4) // 3 tweets in the thread plus the quoted tweet
t1, is_ok := trove.Tweets[1624966566264680448]
@ -693,7 +748,7 @@ func TestConversationThreadEntryWithShowMoreButton(t *testing.T) {
err = json.Unmarshal(data, &entry_result)
require.NoError(err)
trove := entry_result.ToTweetTrove(true)
trove := entry_result.ToTweetTrove()
assert.Len(trove.Tweets, 1)
t1, is_ok := trove.Tweets[1649803385485377536]

View File

@ -1 +1 @@
{"result":{"__typename":"TweetTombstone","tombstone":{"__typename":"TextTombstone","text":{"rtl":false,"text":"This Tweet was deleted by the Tweet author. Learn more","entities":[{"fromIndex":44,"toIndex":54,"ref":{"type":"TimelineUrl","url":"https://help.twitter.com/rules-and-policies/notices-on-twitter","urlType":"ExternalUrl"}}]}}}}
{"entryId":"tweet-1454515503242829830","sortIndex":"7768856533611945977","content":{"entryType":"TimelineTimelineItem","__typename":"TimelineTimelineItem","itemContent":{"itemType":"TimelineTweet","__typename":"TimelineTweet","tweet_results":{"result":{"__typename":"TweetTombstone","tombstone":{"__typename":"TextTombstone","text":{"rtl":false,"text":"Youre unable to view this Tweet because this account owner limits who can view their Tweets. Learn more","entities":[{"fromIndex":94,"toIndex":104,"ref":{"type":"TimelineUrl","url":"https://help.twitter.com/rules-and-policies/notices-on-twitter","urlType":"ExternalUrl"}}]}}}},"tweetDisplayType":"Tweet","hasModeratedReplies":false}}}

View File

@ -33,9 +33,8 @@ func (l CommaSeparatedList) Value() (driver.Value, error) {
}
type Tweet struct {
ID TweetID `db:"id"`
UserID UserID `db:"user_id"`
UserHandle UserHandle // For processing tombstones
ID TweetID `db:"id"`
UserID UserID `db:"user_id"`
User *User
Text string `db:"text"`
IsExpandable bool `db:"is_expandable"`
@ -47,6 +46,11 @@ type Tweet struct {
InReplyToID TweetID `db:"in_reply_to_id"`
QuotedTweetID TweetID `db:"quoted_tweet_id"`
// For processing tombstones
UserHandle UserHandle
in_reply_to_user_handle UserHandle
in_reply_to_user_id UserID
Images []Image
Videos []Video
Urls []Url
@ -221,6 +225,10 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
ret.LastScrapedAt = TimestampFromUnix(0) // Caller will change this for the tweet that was actually scraped
ret.IsConversationScraped = false // Safe due to the "No Worsening" principle
// Extra data that can help piece together tombstoned tweet info
ret.in_reply_to_user_id = UserID(apiTweet.InReplyToUserID)
ret.in_reply_to_user_handle = UserHandle(apiTweet.InReplyToScreenName)
return
}