Scraper now handles implicit tombstones like '"quoted_status_result": {}'
This commit is contained in:
parent
535f28c278
commit
63ddaaeafb
@ -143,14 +143,15 @@ func (u APIV2UserResult) ToUser() User {
|
|||||||
return user
|
return user
|
||||||
}
|
}
|
||||||
|
|
||||||
type _Result struct {
|
type Tombstone struct {
|
||||||
ID int64 `json:"rest_id,string"`
|
|
||||||
Legacy APIV2Tweet `json:"legacy"`
|
|
||||||
Tombstone *struct {
|
|
||||||
Text struct {
|
Text struct {
|
||||||
Text string `json:"text"`
|
Text string `json:"text"`
|
||||||
} `json:"text"`
|
} `json:"text"`
|
||||||
} `json:"tombstone"`
|
}
|
||||||
|
type _Result struct {
|
||||||
|
ID int64 `json:"rest_id,string"`
|
||||||
|
Legacy APIV2Tweet `json:"legacy"`
|
||||||
|
Tombstone *Tombstone `json:"tombstone"`
|
||||||
Core *APIV2UserResult `json:"core"`
|
Core *APIV2UserResult `json:"core"`
|
||||||
Card APIV2Card `json:"card"`
|
Card APIV2Card `json:"card"`
|
||||||
QuotedStatusResult *APIV2Result `json:"quoted_status_result"`
|
QuotedStatusResult *APIV2Result `json:"quoted_status_result"`
|
||||||
@ -213,6 +214,16 @@ func (api_result APIV2Result) ToTweetTrove() (TweetTrove, error) {
|
|||||||
quoted_api_result := api_result.Result.QuotedStatusResult
|
quoted_api_result := api_result.Result.QuotedStatusResult
|
||||||
quoted_trove, err := quoted_api_result.ToTweetTrove()
|
quoted_trove, err := quoted_api_result.ToTweetTrove()
|
||||||
|
|
||||||
|
// Handle `"quoted_status_result": {}` results
|
||||||
|
if errors.Is(err, ERR_NO_TWEET) {
|
||||||
|
// Replace it with a tombstone
|
||||||
|
err = ErrorIsTombstone
|
||||||
|
if quoted_api_result.Result.Tombstone == nil {
|
||||||
|
quoted_api_result.Result.Tombstone = &Tombstone{}
|
||||||
|
}
|
||||||
|
quoted_api_result.Result.Tombstone.Text.Text = "This Post is unavailable. Learn more"
|
||||||
|
}
|
||||||
|
|
||||||
// Quoted tombstones can be handled here since we already have the ID and user handle
|
// Quoted tombstones can be handled here since we already have the ID and user handle
|
||||||
if errors.Is(err, ErrorIsTombstone) {
|
if errors.Is(err, ErrorIsTombstone) {
|
||||||
tombstoned_tweet := quoted_api_result.Result.Legacy.APITweet
|
tombstoned_tweet := quoted_api_result.Result.Legacy.APITweet
|
||||||
@ -256,9 +267,9 @@ func (api_result APIV2Result) ToTweetTrove() (TweetTrove, error) {
|
|||||||
if api_result.Result.Legacy.RetweetedStatusResult == nil {
|
if api_result.Result.Legacy.RetweetedStatusResult == nil {
|
||||||
// We have to filter out retweets. For some reason, retweets have a copy of the card in both the retweeting
|
// We have to filter out retweets. For some reason, retweets have a copy of the card in both the retweeting
|
||||||
// and the retweeted TweetResults; it should only be parsed for the real Tweet, not the Retweet
|
// and the retweeted TweetResults; it should only be parsed for the real Tweet, not the Retweet
|
||||||
main_tweet, is_ok := ret.Tweets[TweetID(api_result.Result.Legacy.ID)]
|
main_tweet, is_ok := ret.Tweets[TweetID(api_result.Result.ID)]
|
||||||
if !is_ok {
|
if !is_ok {
|
||||||
panic(fmt.Errorf("Tweet trove didn't contain its own tweet with ID %d:\n %w", api_result.Result.Legacy.ID, EXTERNAL_API_ERROR))
|
return TweetTrove{}, ERR_NO_TWEET
|
||||||
}
|
}
|
||||||
if api_result.Result.Card.Legacy.Name == "summary_large_image" || api_result.Result.Card.Legacy.Name == "player" {
|
if api_result.Result.Card.Legacy.Name == "summary_large_image" || api_result.Result.Card.Legacy.Name == "player" {
|
||||||
url := api_result.Result.Card.ParseAsUrl()
|
url := api_result.Result.Card.ParseAsUrl()
|
||||||
@ -595,9 +606,7 @@ func (api_response APIV2Response) GetCursorBottom() string {
|
|||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// Returns `true` if there are no non-cursor entries in this response, false otherwise
|
||||||
* Returns `true` if there's no non-cursor entries in this response, false otherwise
|
|
||||||
*/
|
|
||||||
func (api_response APIV2Response) IsEmpty() bool {
|
func (api_response APIV2Response) IsEmpty() bool {
|
||||||
for _, e := range api_response.GetMainInstruction().Entries {
|
for _, e := range api_response.GetMainInstruction().Entries {
|
||||||
if !strings.Contains(e.EntryID, "cursor") {
|
if !strings.Contains(e.EntryID, "cursor") {
|
||||||
@ -607,9 +616,7 @@ func (api_response APIV2Response) IsEmpty() bool {
|
|||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// Parse the collected API response and turn it into a TweetTrove
|
||||||
* Parse the collected API response and turn it into a TweetTrove
|
|
||||||
*/
|
|
||||||
func (api_response APIV2Response) ToTweetTrove() (TweetTrove, error) {
|
func (api_response APIV2Response) ToTweetTrove() (TweetTrove, error) {
|
||||||
ret := NewTweetTrove()
|
ret := NewTweetTrove()
|
||||||
|
|
||||||
|
@ -595,7 +595,6 @@ func TestAPIV2UserFeedTombstoneEntry(t *testing.T) {
|
|||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
trove := entry.ToTweetTrove()
|
trove := entry.ToTweetTrove()
|
||||||
assert.NoError(err)
|
|
||||||
// assert.Len(trove.Tweets, 1)
|
// assert.Len(trove.Tweets, 1)
|
||||||
// assert.Len(trove.Users, 1)
|
// assert.Len(trove.Users, 1)
|
||||||
assert.Len(trove.Retweets, 0)
|
assert.Len(trove.Retweets, 0)
|
||||||
@ -917,3 +916,25 @@ func TestParseResultAsLikes(t *testing.T) {
|
|||||||
assert.True(is_ok, "Like (%#v) didn't have its Tweet in the trove", l)
|
assert.True(is_ok, "Like (%#v) didn't have its Tweet in the trove", l)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestTweetWithImplicitQuotedTombstone(t *testing.T) {
|
||||||
|
assert := assert.New(t)
|
||||||
|
require := require.New(t)
|
||||||
|
data, err := os.ReadFile("test_responses/api_v2/tweet_with_implicit_quoted_tombstone.json")
|
||||||
|
require.NoError(err)
|
||||||
|
var entry_result APIV2Entry
|
||||||
|
err = json.Unmarshal(data, &entry_result)
|
||||||
|
require.NoError(err)
|
||||||
|
|
||||||
|
trove := entry_result.ToTweetTrove()
|
||||||
|
|
||||||
|
assert.Len(trove.Tweets, 2)
|
||||||
|
|
||||||
|
t1, is_ok := trove.Tweets[TweetID(1586033916367904768)]
|
||||||
|
assert.True(is_ok)
|
||||||
|
assert.False(t1.IsStub)
|
||||||
|
t2, is_ok := trove.Tweets[TweetID(1586033437806305280)]
|
||||||
|
assert.True(is_ok)
|
||||||
|
assert.True(t2.IsStub)
|
||||||
|
assert.Equal(t2.TombstoneType, "unavailable")
|
||||||
|
}
|
||||||
|
@ -0,0 +1 @@
|
|||||||
|
{"entryId":"tweet-1586033916367904768","sortIndex":"1713285565408870280","content":{"entryType":"TimelineTimelineItem","__typename":"TimelineTimelineItem","itemContent":{"itemType":"TimelineTweet","__typename":"TimelineTweet","tweet_results":{"result":{"__typename":"Tweet","rest_id":"1586033916367904768","core":{"user_results":{"result":{"__typename":"User","id":"VXNlcjo0NDA2NzI5OA==","rest_id":"44067298","affiliates_highlighted_label":{},"has_graduated_access":true,"is_blue_verified":true,"profile_image_shape":"Circle","legacy":{"blocked_by":false,"blocking":false,"follow_request_sent":false,"followed_by":false,"following":false,"muting":false,"notifications":false,"protected":false,"can_dm":false,"can_media_tag":false,"created_at":"Tue Jun 02 05:35:52 +0000 2009","default_profile":false,"default_profile_image":false,"description":"Author: Dear Reader, The New Right, The Anarchist Handbook & The White Pill \nHost: \"YOUR WELCOME\" \nSubject: Ego & Hubris by Harvey Pekar\nHe/Him ⚑","entities":{"description":{"urls":[]},"url":{"urls":[{"display_url":"amzn.to/3oInafv","expanded_url":"https://amzn.to/3oInafv","url":"https://t.co/7VDFOOtFK2","indices":[0,23]}]}},"fast_followers_count":0,"favourites_count":4831,"followers_count":658993,"friends_count":1040,"has_custom_timelines":true,"is_translator":false,"listed_count":2221,"location":"Austin","media_count":13701,"name":"Michael 
Malice","normal_followers_count":658993,"pinned_tweet_ids_str":["1712211529905353086"],"possibly_sensitive":false,"profile_banner_url":"https://pbs.twimg.com/profile_banners/44067298/1664774013","profile_image_url_https":"https://pbs.twimg.com/profile_images/1415820415314931715/_VVX4GI8_normal.jpg","profile_interstitial_type":"","screen_name":"michaelmalice","statuses_count":162731,"translator_type":"none","url":"https://t.co/7VDFOOtFK2","verified":false,"want_retweets":false,"withheld_in_countries":[]},"has_nft_avatar":false,"super_follow_eligible":false,"super_followed_by":false,"super_following":false}}},"edit_control":{"edit_tweet_ids":["1586033916367904768"],"editable_until_msecs":"1666976727000","is_edit_eligible":false,"edits_remaining":"5"},"is_translatable":false,"source":"<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>","quoted_status_result":{},"legacy":{"bookmark_count":3,"bookmarked":false,"created_at":"Fri Oct 28 16:35:27 +0000 2022","conversation_id_str":"1586033916367904768","display_text_range":[0,65],"entities":{"user_mentions":[],"urls":[],"hashtags":[],"symbols":[]},"favorite_count":436,"favorited":false,"full_text":"the Constitution does not grant rights, it claims to protect them","is_quote_status":true,"lang":"en","quote_count":1,"quoted_status_id_str":"1586033437806305280","quoted_status_permalink":{"url":"https://t.co/r6ZFruksos","expanded":"https://twitter.com/SloCobruh/status/1586033437806305280","display":"twitter.com/SloCobruh/stat…"},"reply_count":16,"retweet_count":27,"retweeted":false,"user_id_str":"44067298","id_str":"1586033916367904768"}}},"tweetDisplayType":"Tweet","highlights":{"textHighlights":[{"startIndex":4,"endIndex":16}]}},"feedbackInfo":{"feedbackKeys":["1060665035"]},"clientEventInfo":{"component":"result","element":"tweet","details":{"timelinesDetails":{"controllerData":"DAACDAAFDAABDAABDAABCgABAAAAAAAAAAAAAAwAAgoAAQAAAAAAAABACgAC+2hCBb/AmpQKAAVD6ekwp16RtQgABgAAAAAKAAdChvNiX2bzlgAAAAAA"}}}}}
|
@ -131,7 +131,7 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
|||||||
ret.PostedAt, err = TimestampFromString(apiTweet.CreatedAt)
|
ret.PostedAt, err = TimestampFromString(apiTweet.CreatedAt)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if ret.ID == 0 {
|
if ret.ID == 0 {
|
||||||
return Tweet{}, fmt.Errorf("unable to parse tweet:\n %w", ERR_NO_TWEET)
|
return Tweet{}, fmt.Errorf("unable to parse tweet: %w", ERR_NO_TWEET)
|
||||||
}
|
}
|
||||||
return Tweet{}, fmt.Errorf("Error parsing time on tweet ID %d:\n %w", ret.ID, err)
|
return Tweet{}, fmt.Errorf("Error parsing time on tweet ID %d:\n %w", ret.ID, err)
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user