From 4708ffc3c9b22c96534fa28f8a7137e2d56310a7 Mon Sep 17 00:00:00 2001 From: Alessio Date: Sun, 30 Jan 2022 17:48:03 -0800 Subject: [PATCH] Intermediate results for refactoring :V --- scraper/api_types_v2.go | 175 ++++++++++++++++++++++----- scraper/api_types_v2_test.go | 226 +++++++++++++++++++++++++++++++++++ 2 files changed, 371 insertions(+), 30 deletions(-) diff --git a/scraper/api_types_v2.go b/scraper/api_types_v2.go index 78c8091..e8cfea0 100644 --- a/scraper/api_types_v2.go +++ b/scraper/api_types_v2.go @@ -27,36 +27,91 @@ func (u APIV2UserResult) ToUser() User { return user } +type APIV2Result struct { + Result struct { + ID int64 `json:"rest_id,string"` + Legacy APIV2Tweet `json:"legacy"` + Tombstone *struct { + Text struct { + Text string `json:"text"` + } `json:"text"` + } `json:"tombstone"` + Core *APIV2UserResult `json:"core"` + QuotedStatusResult *APIV2Result `json:"quoted_status_result"` + } `json:"result"` +} +func (api_result APIV2Result) ToTweetTrove() TweetTrove { + ret := NewTweetTrove() + + if api_result.Result.Core != nil { + main_user := api_result.Result.Core.ToUser() + ret.Users[main_user.ID] = main_user + } else { + // TODO + } + + main_tweet_trove := api_result.Result.Legacy.ToTweetTrove() + ret.MergeWith(main_tweet_trove) + + // Handle quoted tweet + if api_result.Result.QuotedStatusResult != nil { + quoted_api_result := api_result.Result.QuotedStatusResult + + // Quoted tweets might be tombstones! 
+ if quoted_api_result.Result.Tombstone != nil { + tombstoned_tweet := &quoted_api_result.Result.Legacy.APITweet + tombstoned_tweet.TombstoneText = quoted_api_result.Result.Tombstone.Text.Text + tombstoned_tweet.ID = int64(int_or_panic(api_result.Result.Legacy.APITweet.QuotedStatusIDStr)) + handle, err := ParseHandleFromTweetUrl(api_result.Result.Legacy.APITweet.QuotedStatusPermalink.ExpandedURL) + if err != nil { + panic(err) + } + tombstoned_tweet.UserHandle = string(handle) + ret.TombstoneUsers = append(ret.TombstoneUsers, handle) + } + + quoted_trove := api_result.Result.QuotedStatusResult.ToTweetTrove() + ret.MergeWith(quoted_trove) + } + + return ret +} + type APIV2Tweet struct { + // For some reason, retweets are nested *inside* the Legacy tweet, whereas + // quoted-tweets are next to it, as their own tweet + RetweetedStatusResult *APIV2Result `json:"retweeted_status_result"` APITweet - RetweetedStatusResult struct { - Result struct { - ID int `json:"rest_id,string"` - Legacy APITweet `json:"legacy"` - Core struct { - UserResults struct { - Result struct { - ID int64 `json:"rest_id,string"` - Legacy APIUser `json:"legacy"` - } `json:"result"` - } `json:"user_results"` - } `json:"core"` - QuotedStatusResult struct { - Result struct { - ID int64 `json:"rest_id,string"` - Legacy APITweet `json:"legacy"` - Core struct { - UserResults struct { - Result struct { - ID int64 `json:"rest_id,string"` - Legacy APIUser `json:"legacy"` - } `json:"result"` - } `json:"user_results"` - } `json:"core"` - } `json:"result"` - } `json:"quoted_status_result"` - } `json:"result"` - } `json:"retweeted_status_result"` +} +func (api_v2_tweet APIV2Tweet) ToTweetTrove() TweetTrove { + ret := NewTweetTrove() + + // If there's a retweet, we ignore the main tweet except for posted_at and retweeting UserID + if api_v2_tweet.RetweetedStatusResult != nil { + orig_tweet_trove := api_v2_tweet.RetweetedStatusResult.ToTweetTrove() + ret.MergeWith(orig_tweet_trove) + + + retweet := Retweet{} + var 
err error + retweet.RetweetID = TweetID(api_v2_tweet.ID) + retweet.TweetID = TweetID(api_v2_tweet.RetweetedStatusResult.Result.ID) + retweet.RetweetedByID = UserID(api_v2_tweet.APITweet.UserID) + retweet.RetweetedAt, err = time.Parse(time.RubyDate, api_v2_tweet.APITweet.CreatedAt) + if err != nil { + fmt.Printf("%v\n", api_v2_tweet) + panic(err) + } + ret.Retweets[retweet.RetweetID] = retweet + } else { + main_tweet, err := ParseSingleTweet(api_v2_tweet.APITweet) + if err != nil { + panic(err) + } + ret.Tweets[main_tweet.ID] = main_tweet + } + + return ret } type APIV2Response struct { @@ -75,7 +130,37 @@ type APIV2Response struct { EntryType string `json:"entryType"` TweetResults struct { Result struct { - Legacy APIV2Tweet `json:"legacy"` + Legacy struct { + APITweet + RetweetedStatusResult struct { + Result struct { + ID int `json:"rest_id,string"` + Legacy APITweet `json:"legacy"` + Core struct { + UserResults struct { + Result struct { + ID int64 `json:"rest_id,string"` + Legacy APIUser `json:"legacy"` + } `json:"result"` + } `json:"user_results"` + } `json:"core"` + QuotedStatusResult struct { + Result struct { + ID int64 `json:"rest_id,string"` + Legacy APITweet `json:"legacy"` + Core struct { + UserResults struct { + Result struct { + ID int64 `json:"rest_id,string"` + Legacy APIUser `json:"legacy"` + } `json:"result"` + } `json:"user_results"` + } `json:"core"` + } `json:"result"` + } `json:"quoted_status_result"` + } `json:"result"` + } `json:"retweeted_status_result"` + } `json:"legacy"` Core struct { UserResults struct { Result struct { @@ -87,7 +172,37 @@ type APIV2Response struct { QuotedStatusResult struct { // Same as "Result" Result struct { ID int64 `json:"rest_id,string"` - Legacy APIV2Tweet `json:"legacy"` + Legacy struct { + APITweet + RetweetedStatusResult struct { + Result struct { + ID int `json:"rest_id,string"` + Legacy APITweet `json:"legacy"` + Core struct { + UserResults struct { + Result struct { + ID int64 `json:"rest_id,string"` + 
Legacy APIUser `json:"legacy"` + } `json:"result"` + } `json:"user_results"` + } `json:"core"` + QuotedStatusResult struct { + Result struct { + ID int64 `json:"rest_id,string"` + Legacy APITweet `json:"legacy"` + Core struct { + UserResults struct { + Result struct { + ID int64 `json:"rest_id,string"` + Legacy APIUser `json:"legacy"` + } `json:"result"` + } `json:"user_results"` + } `json:"core"` + } `json:"result"` + } `json:"quoted_status_result"` + } `json:"result"` + } `json:"retweeted_status_result"` + } `json:"legacy"` Core struct { UserResults struct { Result struct { diff --git a/scraper/api_types_v2_test.go b/scraper/api_types_v2_test.go index 4259d11..c70647a 100644 --- a/scraper/api_types_v2_test.go +++ b/scraper/api_types_v2_test.go @@ -10,6 +10,7 @@ import ( "github.com/stretchr/testify/assert" ) + /** * Parse an APIV2User */ @@ -46,6 +47,231 @@ func TestAPIV2ParseUser(t *testing.T) { assert.Equal(user.PinnedTweetID, TweetID(1477347403023982596)) } + +/** + * Parse a plain text tweet + */ +func TestAPIV2ParseTweet(t *testing.T) { + data, err := ioutil.ReadFile("test_responses/api_v2/tweet_plaintext.json") + if err != nil { + panic(err) + } + assert := assert.New(t) + + var tweet_result APIV2Result + err = json.Unmarshal(data, &tweet_result) + assert.NoError(err) + + trove := tweet_result.ToTweetTrove() + + assert.Equal(1, len(trove.Tweets)) + tweet, ok := trove.Tweets[1485708879174508550] + assert.True(ok) + assert.Equal(tweet.ID, TweetID(1485708879174508550)) + assert.Equal(tweet.UserID, UserID(44067298)) + assert.Equal(tweet.Text, "If Boris Johnson is driven out of office, it wouldn't mark the first time the Tories had four PMs in a row\nThey had previously governed the UK for 13 years with 4 PMs, from 1951-1964") + assert.Equal(tweet.PostedAt.Unix(), int64(1643055574)) + assert.Equal(tweet.QuotedTweetID, TweetID(0)) + assert.Equal(tweet.InReplyToID, TweetID(0)) + assert.Equal(tweet.NumLikes, 38) + assert.Equal(tweet.NumRetweets, 2) + 
assert.Equal(tweet.NumReplies, 2) + assert.Equal(tweet.NumQuoteTweets, 1) + assert.Equal(0, len(tweet.Images)) + assert.Equal(0, len(tweet.Videos)) + assert.Equal(0, len(tweet.Polls)) + assert.Equal(0, len(tweet.Mentions)) + assert.Equal(0, len(tweet.ReplyMentions)) + assert.Equal(0, len(tweet.Hashtags)) + assert.Equal(0, len(tweet.Polls)) + assert.Equal("", tweet.TombstoneType) + assert.False(tweet.IsStub) + + assert.Equal(1, len(trove.Users)) + user, ok := trove.Users[44067298] + assert.True(ok) + assert.Equal(UserID(44067298), user.ID) + assert.Equal(UserHandle("michaelmalice"), user.Handle) + + assert.Equal(0, len(trove.Retweets)) +} + + +/** + * Parse a tweet with a quoted tweet + */ +func TestAPIV2ParseTweetWithQuotedTweet(t *testing.T) { + assert := assert.New(t) + data, err := ioutil.ReadFile("test_responses/api_v2/tweet_with_quoted_tweet.json") + if err != nil { + panic(err) + } + + var tweet_result APIV2Result + err = json.Unmarshal(data, &tweet_result) + assert.NoError(err) + + trove := tweet_result.ToTweetTrove() + + // Should be 2 tweets: quote-tweet and quoted-tweet + assert.Equal(2, len(trove.Tweets)) + + quoted_tweet, ok := trove.Tweets[1485690069079846915] + assert.True(ok) + assert.Equal(TweetID(1485690069079846915), quoted_tweet.ID) + assert.Equal(UserID(892155218292617217), quoted_tweet.UserID) + assert.Equal("The Left hates the Right so much that they won't let them leave the Union. 
I don't get it.", quoted_tweet.Text) + assert.Equal(int64(1643051089), quoted_tweet.PostedAt.Unix()) + assert.Equal(TweetID(1485689207435710464), quoted_tweet.InReplyToID) + assert.Equal(TweetID(0), quoted_tweet.QuotedTweetID) + assert.Equal(1, len(quoted_tweet.ReplyMentions)) + assert.Contains(quoted_tweet.ReplyMentions, UserHandle("michaelmalice")) + assert.Equal(1, quoted_tweet.NumReplies) + assert.Equal(12, quoted_tweet.NumLikes) + + quote_tweet, ok := trove.Tweets[1485690410899021826] + assert.True(ok) + assert.Equal(TweetID(1485690410899021826), quote_tweet.ID) + assert.Equal(TweetID(1485690069079846915), quote_tweet.QuotedTweetID) + assert.Equal("Hatred is powerless in and of itself despite all the agitprop to the contrary\nHatred didnt stop Trump's election, for example", quote_tweet.Text) + + // Should be 2 users: quoter and quoted + assert.Equal(2, len(trove.Users)) + + user_quoting, ok := trove.Users[44067298] + assert.True(ok) + assert.Equal(UserHandle("michaelmalice"), user_quoting.Handle) + + user_quoted, ok := trove.Users[892155218292617217] + assert.True(ok) + assert.Equal(UserHandle("baalzimon"), user_quoted.Handle) + + // No retweets + assert.Equal(0, len(trove.Retweets)) +} + + +/** + * Parse a retweet + */ +func TestAPIV2ParseRetweet(t *testing.T) { + assert := assert.New(t) + data, err := ioutil.ReadFile("test_responses/api_v2/retweet.json") + if err != nil { + panic(err) + } + + var tweet_result APIV2Result + err = json.Unmarshal(data, &tweet_result) + assert.NoError(err) + + trove := tweet_result.ToTweetTrove() + + // Should only be 1 tweet, the retweeted one + assert.Equal(1, len(trove.Tweets)) + tweet, ok := trove.Tweets[1485694028620316673] + assert.True(ok) + assert.Equal(TweetID(1485694028620316673), tweet.ID) + assert.Equal(UserID(1326229737551912960), tweet.UserID) + assert.Equal("More mask madness, this time in an elevator. 
The mask police are really nuts https://t.co/3BpvLjdJwD", tweet.Text) + assert.Equal(int64(1643052033), tweet.PostedAt.Unix()) + assert.Equal(5373, tweet.NumLikes) + assert.Equal(TweetID(0), tweet.InReplyToID) + assert.Equal(1, len(tweet.Videos)) + + // Check the video + v := tweet.Videos[0] + assert.Equal("https://pbs.twimg.com/ext_tw_video_thumb/1485627274594590721/pu/img/O6mMKrsqWl8WcMy1.jpg", v.ThumbnailRemoteUrl) + assert.Equal(0, v.ViewCount) // TODO: make this work + assert.Equal(720, v.Height) + assert.Equal(720, v.Width) + assert.Equal(30066, v.Duration) + + // Should fetch both the retweeting and retweeted users + assert.Equal(2, len(trove.Users)) + + retweeted_user, ok := trove.Users[1326229737551912960] + assert.True(ok) + assert.Equal(UserID(1326229737551912960), retweeted_user.ID) + assert.Equal(UserHandle("libsoftiktok"), retweeted_user.Handle) + + retweeting_user, ok := trove.Users[44067298] + assert.True(ok) + assert.Equal(UserID(44067298), retweeting_user.ID) + assert.Equal(UserHandle("michaelmalice"), retweeting_user.Handle) + + + // Should be 1 retweet + assert.Equal(1, len(trove.Retweets)) + retweet, ok := trove.Retweets[1485699748514476037] + assert.True(ok) + assert.Equal(TweetID(1485699748514476037), retweet.RetweetID) + assert.Equal(TweetID(1485694028620316673), retweet.TweetID) + assert.Equal(int64(1643053397), retweet.RetweetedAt.Unix()) + assert.Equal(UserID(44067298), retweet.RetweetedByID) +} + + +/** + * Parse a retweeted quote tweet + */ +func TestAPIV2ParseRetweetedQuoteTweet(t *testing.T) { + assert := assert.New(t) + data, err := ioutil.ReadFile("test_responses/api_v2/retweet_with_quote_tweet.json") + if err != nil { + panic(err) + } + + var tweet_result APIV2Result + err = json.Unmarshal(data, &tweet_result) + assert.NoError(err) + + trove := tweet_result.ToTweetTrove() + + // Quoted tweet and quoting tweet + assert.Equal(2, len(trove.Tweets)) + quoted_tweet, ok := trove.Tweets[1484900469482962944] + assert.True(ok) + 
assert.Equal(TweetID(1484900469482962944), quoted_tweet.ID) + assert.Equal(UserID(14347972), quoted_tweet.UserID) + assert.Equal(TweetID(1484643409130397702), quoted_tweet.QuotedTweetID) + + quoting_tweet, ok := trove.Tweets[1485272859102621697] + assert.True(ok) + assert.Equal(TweetID(1485272859102621697), quoting_tweet.ID) + assert.Equal(UserID(1434720042193760256), quoting_tweet.UserID) + assert.Equal(TweetID(1484900469482962944), quoting_tweet.QuotedTweetID) + assert.Equal(200, quoting_tweet.NumLikes) + + // 3 Users: quoted, quoter, and retweeter + assert.Equal(3, len(trove.Users)) + + retweeting_user, ok := trove.Users[599817378] + assert.True(ok) + assert.Equal(UserID(599817378), retweeting_user.ID) + assert.Equal(UserHandle("ScottMGreer"), retweeting_user.Handle) + + retweeted_user, ok := trove.Users[1434720042193760256] + assert.True(ok) + assert.Equal(UserID(1434720042193760256), retweeted_user.ID) + assert.Equal(UserHandle("LatinxPutler"), retweeted_user.Handle) + + quoted_user, ok := trove.Users[14347972] + assert.True(ok) + assert.Equal(UserID(14347972), quoted_user.ID) + assert.Equal(UserHandle("Heminator"), quoted_user.Handle) + + // Should be 1 retweet + assert.Equal(1, len(trove.Retweets)) + retweet, ok := trove.Retweets[1485273090665984000] + assert.True(ok) + assert.Equal(TweetID(1485273090665984000), retweet.RetweetID) + assert.Equal(TweetID(1485272859102621697), retweet.TweetID) + assert.Equal(int64(1642951674), retweet.RetweetedAt.Unix()) + assert.Equal(UserID(599817378), retweet.RetweetedByID) +} + + // Check a plain old tweet func TestAPIV2FeedSimpleTweet(t *testing.T) { data, err := ioutil.ReadFile("test_responses/api_v2/feed_simple_tweet.json")