From bd0e3537719bb99e7e5c954ef9fba9b44e6e87cc Mon Sep 17 00:00:00 2001 From: Alessio Date: Mon, 23 Dec 2024 19:06:29 -0800 Subject: [PATCH] REFACTOR: move a bunch of stuff around in the scraper to consolidate - delete type-specific test files and put them in api_types_test - remove an unneeded test file --- pkg/scraper/api_types.go | 140 +++++--- pkg/scraper/api_types_test.go | 309 ++++++++++++++++++ pkg/scraper/image_test.go | 31 -- pkg/scraper/poll_test.go | 90 ----- pkg/scraper/retweet_test.go | 38 --- .../api_v2/feed_replying_tweet.json | 1 - pkg/scraper/url_test.go | 165 ---------- pkg/scraper/video_test.go | 49 --- 8 files changed, 399 insertions(+), 424 deletions(-) delete mode 100644 pkg/scraper/image_test.go delete mode 100644 pkg/scraper/poll_test.go delete mode 100644 pkg/scraper/retweet_test.go delete mode 100644 pkg/scraper/test_responses/api_v2/feed_replying_tweet.json delete mode 100644 pkg/scraper/url_test.go delete mode 100644 pkg/scraper/video_test.go diff --git a/pkg/scraper/api_types.go b/pkg/scraper/api_types.go index 4ea014c..eb4f47b 100644 --- a/pkg/scraper/api_types.go +++ b/pkg/scraper/api_types.go @@ -14,6 +14,10 @@ import ( "time" ) +// ------------------------------------------------------------------------- +// Image content +// ------------------------------------------------------------------------- + type APIMedia struct { ID int64 `json:"id_str,string"` MediaURLHttps string `json:"media_url_https"` @@ -38,6 +42,10 @@ func ParseAPIMedia(apiMedia APIMedia) Image { } } +// ------------------------------------------------------------------------- +// Video content +// ------------------------------------------------------------------------- + type Variant struct { Bitrate int `json:"bitrate,omitempty"` URL string `json:"url"` @@ -66,6 +74,60 @@ type APIExtendedMedia struct { URL string `json:"url"` // For DM videos } +func ParseAPIVideo(apiVideo APIExtendedMedia) Video { + variants := apiVideo.VideoInfo.Variants + slices.SortFunc(variants, func(a, b Variant) int { return b.Bitrate - a.Bitrate }) + video_remote_url := variants[0].URL + + var view_count int + + r := apiVideo.Ext.MediaStats.R + + switch r.(type) { + case string: + view_count = 0 + case map[string]interface{}: + OK_entry, ok := r.(map[string]interface{})["ok"] + if !ok { + panic("No 'ok' value found in the R!") + } + view_count_str, ok := OK_entry.(map[string]interface{})["viewCount"] + view_count = int_or_panic(view_count_str.(string)) + if !ok { + panic("No 'viewCount' value found in the OK!") + } + } + + video_parsed_url, err := url.Parse(video_remote_url) + if err != nil { + panic(err) + } + + local_filename := get_prefixed_path(path.Base(video_parsed_url.Path)) + + return Video{ + ID: VideoID(apiVideo.ID), + Width: apiVideo.OriginalInfo.Width, + Height: apiVideo.OriginalInfo.Height, + RemoteURL: video_remote_url, + LocalFilename: local_filename, + + ThumbnailRemoteUrl: apiVideo.MediaURLHttps, + ThumbnailLocalPath: get_prefixed_path(path.Base(apiVideo.MediaURLHttps)), + Duration: apiVideo.VideoInfo.Duration, + ViewCount: view_count, + + IsDownloaded: false, + IsBlockedByDMCA: false, + IsGeoblocked: apiVideo.ExtMediaAvailability.Reason == "Geoblocked", + IsGif: apiVideo.Type == "animated_gif", + } +} + +// ------------------------------------------------------------------------- +// Cards: polls and urls +// ------------------------------------------------------------------------- + type APICard struct { Name string `json:"name"` ShortenedUrl string `json:"url"` @@ -194,56 +256,6 @@ func parse_num_choices(card_name string) int { return int_or_panic(card_name[4:5]) } -func ParseAPIVideo(apiVideo APIExtendedMedia) Video { - variants := apiVideo.VideoInfo.Variants - slices.SortFunc(variants, func(a, b Variant) int { return b.Bitrate - a.Bitrate }) - video_remote_url := variants[0].URL - - var view_count int - - r := apiVideo.Ext.MediaStats.R - - switch r.(type) { - case string: - view_count = 0 - case map[string]interface{}: - OK_entry, ok := r.(map[string]interface{})["ok"] - if !ok { - panic("No 'ok' value found in the R!") - } - view_count_str, ok := OK_entry.(map[string]interface{})["viewCount"] - view_count = int_or_panic(view_count_str.(string)) - if !ok { - panic("No 'viewCount' value found in the OK!") - } - } - - video_parsed_url, err := url.Parse(video_remote_url) - if err != nil { - panic(err) - } - - local_filename := get_prefixed_path(path.Base(video_parsed_url.Path)) - - return Video{ - ID: VideoID(apiVideo.ID), - Width: apiVideo.OriginalInfo.Width, - Height: apiVideo.OriginalInfo.Height, - RemoteURL: video_remote_url, - LocalFilename: local_filename, - - ThumbnailRemoteUrl: apiVideo.MediaURLHttps, - ThumbnailLocalPath: get_prefixed_path(path.Base(apiVideo.MediaURLHttps)), - Duration: apiVideo.VideoInfo.Duration, - ViewCount: view_count, - - IsDownloaded: false, - IsBlockedByDMCA: false, - IsGeoblocked: apiVideo.ExtMediaAvailability.Reason == "Geoblocked", - IsGif: apiVideo.Type == "animated_gif", - } -} - func ParseAPIUrlCard(apiCard APICard) Url { values := apiCard.BindingValues ret := Url{} @@ -280,6 +292,10 @@ func ParseAPIUrlCard(apiCard APICard) Url { return ret } +// Some filesystems get slow if the number of items in a directory is very large. To handle this, +// we add a 2 letter directory prefix, based on the first 2 letters of the filename: +// +// e.g., `abcdefg.asdf` => `ab/abcdefg.asdf` func get_prefixed_path(p string) string { local_prefix_regex := regexp.MustCompile(`^[\w-]{2}`) local_prefix := local_prefix_regex.FindString(p) @@ -307,6 +323,10 @@ func get_thumbnail_local_path(remote_url string) string { ) } +// ------------------------------------------------------------------------- +// Individual tweets +// ------------------------------------------------------------------------- + type APITweet struct { ID int64 `json:"id_str,string"` ConversationID int64 `json:"conversation_id_str,string"` @@ -567,6 +587,10 @@ func (t APITweet) String() string { return string(data) } +// ------------------------------------------------------------------------- +// User information +// ------------------------------------------------------------------------- + type APIUser struct { CreatedAt string `json:"created_at"` Description string `json:"description"` @@ -643,6 +667,10 @@ func ParseSingleUser(apiUser APIUser) (ret User, err error) { return } +// ------------------------------------------------------------------------- +// Notifications +// ------------------------------------------------------------------------- + type APINotification struct { ID string `json:"id"` TimestampMs int64 `json:"timestampMs,string"` @@ -674,6 +702,10 @@ type APINotification struct { } `json:"template"` } +// ------------------------------------------------------------------------- +// Metadata object for ordering, which contains implicit data for tombstones +// ------------------------------------------------------------------------- + type APIv1Entry struct { EntryID string `json:"entryId"` SortIndex int64 `json:"sortIndex,string"` @@ -710,6 +742,10 @@ type APIv1Entry struct { func entry_sorting_cmp(a, b APIv1Entry) int { return int(b.SortIndex - a.SortIndex) } +// ------------------------------------------------------------------------- +// Full APIv1 response +// ------------------------------------------------------------------------- + type APIv1Response struct { GlobalObjects struct { Tweets map[string]APITweet `json:"tweets"` @@ -902,6 +938,10 @@ func (t *APIv1Response) ToTweetTrove() (TweetTrove, error) { return ret, nil } +// ------------------------------------------------------------------------- +// Utils +// ------------------------------------------------------------------------- + func idstr_to_int(s string) int64 { return int64(int_or_panic(s)) } diff --git a/pkg/scraper/api_types_test.go b/pkg/scraper/api_types_test.go index 4f0b476..b035b98 100644 --- a/pkg/scraper/api_types_test.go +++ b/pkg/scraper/api_types_test.go @@ -5,6 +5,7 @@ import ( "net/http" "os" "testing" + "time" "github.com/jarcoal/httpmock" "github.com/stretchr/testify/assert" @@ -172,6 +173,277 @@ func TestHandleTombstonesUnavailable(t *testing.T) { } } +func TestParseAPIMedia(t *testing.T) { + assert := assert.New(t) + data, err := os.ReadFile("test_responses/tweet_content/image.json") + if err != nil { + panic(err) + } + var apimedia APIMedia + err = json.Unmarshal(data, &apimedia) + require.NoError(t, err) + + image := ParseAPIMedia(apimedia) + assert.Equal(ImageID(1395882862289772553), image.ID) + assert.Equal("https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg", image.RemoteURL) + assert.Equal(593, image.Width) + assert.Equal(239, image.Height) + assert.Equal("E1/E18sEUrWYAk8dBl.jpg", image.LocalFilename) + assert.False(image.IsDownloaded) +} + +func TestParsePoll2Choices(t *testing.T) { + assert := assert.New(t) + data, err := os.ReadFile("test_responses/tweet_content/poll_card_2_options.json") + if err != nil { + panic(err) + } + var apiCard APICard + err = json.Unmarshal(data, &apiCard) + require.NoError(t, err) + + poll := ParseAPIPoll(apiCard) + assert.Equal(PollID(1457419248461131776), poll.ID) + assert.Equal(2, poll.NumChoices) + assert.Equal(60*60*24, poll.VotingDuration) + assert.Equal(int64(1636397201), poll.VotingEndsAt.Unix()) + assert.Equal(int64(1636318755), poll.LastUpdatedAt.Unix()) + + assert.Less(poll.LastUpdatedAt.Unix(), poll.VotingEndsAt.Unix()) + assert.Equal("Yes", poll.Choice1) + assert.Equal("No", poll.Choice2) + assert.Equal(529, poll.Choice1_Votes) + assert.Equal(2182, poll.Choice2_Votes) +} + +func TestParsePoll4Choices(t *testing.T) { + assert := assert.New(t) + data, err := os.ReadFile("test_responses/tweet_content/poll_card_4_options_ended.json") + if err != nil { + panic(err) + } + var apiCard APICard + err = json.Unmarshal(data, &apiCard) + require.NoError(t, err) + + poll := ParseAPIPoll(apiCard) + assert.Equal(PollID(1455611588854140929), poll.ID) + assert.Equal(4, poll.NumChoices) + assert.Equal(60*60*24, poll.VotingDuration) + assert.Equal(int64(1635966221), poll.VotingEndsAt.Unix()) + assert.Equal(int64(1635966226), poll.LastUpdatedAt.Unix()) + assert.Greater(poll.LastUpdatedAt.Unix(), poll.VotingEndsAt.Unix()) + + assert.Equal("Alec Baldwin", poll.Choice1) + assert.Equal(1669, poll.Choice1_Votes) + + assert.Equal("Andew Cuomo", poll.Choice2) + assert.Equal(272, poll.Choice2_Votes) + + assert.Equal("George Floyd", poll.Choice3) + assert.Equal(829, poll.Choice3_Votes) + + assert.Equal("Derek Chauvin", poll.Choice4) + assert.Equal(2397, poll.Choice4_Votes) +} + +func TestPollHelpers(t *testing.T) { + assert := assert.New(t) + p := Poll{ + Choice1_Votes: 1, + Choice2_Votes: 2, + Choice3_Votes: 3, + Choice4_Votes: 4, + VotingEndsAt: Timestamp{Time: time.Now().Add(10 * time.Second)}, + } + assert.Equal(p.TotalVotes(), 10) + assert.Equal(p.VotePercentage(p.Choice3_Votes), 30.0) + + assert.True(p.IsOpen()) + assert.False(p.IsWinner(p.Choice4_Votes)) + + // End the poll + p.VotingEndsAt = Timestamp{Time: time.Now().Add(-10 * time.Second)} + assert.False(p.IsOpen()) + assert.False(p.IsWinner(p.Choice2_Votes)) + assert.True(p.IsWinner(p.Choice4_Votes)) +} + +func TestParseSingleRetweet(t *testing.T) { + assert := assert.New(t) + require := require.New(t) + data, err := os.ReadFile("test_responses/tweet_that_is_a_retweet.json") + if err != nil { + panic(err) + } + var api_tweet APITweet + err = json.Unmarshal(data, &api_tweet) + require.NoError(err) + + trove, err := api_tweet.ToTweetTrove() + require.NoError(err) + + require.Len(trove.Tweets, 0) + require.Len(trove.Retweets, 1) + + retweet, is_ok := trove.Retweets[TweetID(1404270043018448896)] + require.True(is_ok) + + assert.Equal(TweetID(1404270043018448896), retweet.RetweetID) + assert.Equal(TweetID(1404269989646028804), retweet.TweetID) + assert.Equal(UserID(44067298), retweet.RetweetedByID) + assert.Equal(int64(1623639042), retweet.RetweetedAt.Unix()) +} + +func TestParseAPIUrlCard(t *testing.T) { + assert := assert.New(t) + data, err := os.ReadFile("test_responses/tweet_content/url_card.json") + if err != nil { + panic(err) + } + var apiCard APICard + err = json.Unmarshal(data, &apiCard) + require.NoError(t, err) + + url := ParseAPIUrlCard(apiCard) + assert.Equal("reason.com", url.Domain) + assert.Equal("L.A. Teachers Union Leader: 'There's No Such Thing As Learning Loss'", url.Title) + assert.Equal("\"It’s OK that our babies may not have learned all their times tables,\" says Cecily Myart-Cruz. \"They learned "+ + "resilience.\"", url.Description) + assert.Equal(600, url.ThumbnailWidth) + assert.Equal(315, url.ThumbnailHeight) + assert.Equal("https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x600", url.ThumbnailRemoteUrl) + assert.Equal("od/odDi9EqO_600x600.jpg", url.ThumbnailLocalPath) + assert.Equal(UserID(155581583), url.CreatorID) + assert.Equal(UserID(16467567), url.SiteID) + assert.True(url.HasThumbnail) + assert.False(url.IsContentDownloaded) +} + +func TestParseAPIUrlCardWithPlayer(t *testing.T) { + assert := assert.New(t) + data, err := os.ReadFile("test_responses/tweet_content/url_card_with_player.json") + if err != nil { + panic(err) + } + var apiCard APICard + err = json.Unmarshal(data, &apiCard) + require.NoError(t, err) + + url := ParseAPIUrlCard(apiCard) + assert.Equal("www.youtube.com", url.Domain) + assert.Equal("The Politically Incorrect Guide to the Constitution (Starring Tom...", url.Title) + assert.Equal("Watch this episode on LBRY/Odysee: https://odysee.com/@capitalresearch:5/the-politically-incorrect-guide-to-the:8"+ + "Watch this episode on Rumble: https://rumble...", url.Description) + assert.Equal("https://pbs.twimg.com/card_img/1437849456423194639/_1t0btyt?format=jpg&name=800x320_1", url.ThumbnailRemoteUrl) + assert.Equal("_1/_1t0btyt_800x320_1.jpg", url.ThumbnailLocalPath) + assert.Equal(UserID(10228272), url.SiteID) + assert.True(url.HasThumbnail) + assert.False(url.IsContentDownloaded) +} + +func TestParseAPIUrlCardWithPlayerAndPlaceholderThumbnail(t *testing.T) { + assert := assert.New(t) + data, err := os.ReadFile("test_responses/tweet_content/url_card_with_player_placeholder_image.json") + if err != nil { + panic(err) + } + var apiCard APICard + err = json.Unmarshal(data, &apiCard) + require.NoError(t, err) + + url := ParseAPIUrlCard(apiCard) + assert.Equal("www.youtube.com", url.Domain) + assert.Equal("Did Michael Malice Turn Me into an Anarchist? | Ep 181", url.Title) + assert.Equal("SUBSCRIBE TO THE NEW SHOW W/ ELIJAH & SYDNEY: \"YOU ARE HERE\"YT: https://www.youtube.com/youareheredaily____________"+ + "__________________________________________...", url.Description) + assert.Equal("https://pbs.twimg.com/cards/player-placeholder.png", url.ThumbnailRemoteUrl) + assert.Equal("player-placeholder.png", url.ThumbnailLocalPath) + assert.Equal(UserID(10228272), url.SiteID) + assert.True(url.HasThumbnail) + assert.False(url.IsContentDownloaded) +} + +func TestParseAPIUrlCardWithoutThumbnail(t *testing.T) { + assert := assert.New(t) + data, err := os.ReadFile("test_responses/tweet_content/url_card_without_thumbnail.json") + if err != nil { + panic(err) + } + var apiCard APICard + err = json.Unmarshal(data, &apiCard) + require.NoError(t, err) + + url := ParseAPIUrlCard(apiCard) + assert.Equal("en.m.wikipedia.org", url.Domain) + assert.Equal("Entryism - Wikipedia", url.Title) + assert.Equal("", url.Description) + assert.True(url.HasCard) + assert.False(url.HasThumbnail) +} + +// Should check if a url is a tweet url, and if so, parse it +func TestParseTweetUrl(t *testing.T) { + assert := assert.New(t) + + // Test valid tweet url + url := "https://twitter.com/kanesays23/status/1429583672827465730" + handle, id, is_ok := TryParseTweetUrl(url) + assert.True(is_ok) + assert.Equal(UserHandle("kanesays23"), handle) + assert.Equal(TweetID(1429583672827465730), id) + + // Test url with GET params + handle, id, is_ok = TryParseTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720?s=20") + assert.True(is_ok) + assert.Equal(UserHandle("NerdNoticing"), handle) + assert.Equal(TweetID(1263192389050654720), id) + + // Test a `mobile.twitter.com` url + handle, id, is_ok = TryParseTweetUrl("https://mobile.twitter.com/APhilosophae/status/1497720548540964864") + assert.True(is_ok) + assert.Equal(UserHandle("APhilosophae"), handle) + assert.Equal(TweetID(1497720548540964864), id) + + // Test a `x.com` url + handle, id, is_ok = TryParseTweetUrl("https://x.com/brutedeforce/status/1579695139425222657?s=46") + assert.True(is_ok) + assert.Equal(UserHandle("brutedeforce"), handle) + assert.Equal(TweetID(1579695139425222657), id) + + // Test invalid url + _, _, is_ok = TryParseTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720s=20") + assert.False(is_ok) + + // Test empty string + _, _, is_ok = TryParseTweetUrl("") + assert.False(is_ok) +} + +// Should extract a user handle from a tweet URL, or fail if URL is invalid +func TestParseHandleFromTweetUrl(t *testing.T) { + assert := assert.New(t) + + // Test valid tweet url + url := "https://twitter.com/kanesays23/status/1429583672827465730" + result, err := ParseHandleFromTweetUrl(url) + assert.NoError(err) + assert.Equal(UserHandle("kanesays23"), result) + + // Test url with GET params + result, err = ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720?s=20") + assert.NoError(err) + assert.Equal(UserHandle("NerdNoticing"), result) + + // Test invalid url + _, err = ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720s=20") + assert.Error(err) + + // Test empty string + _, err = ParseHandleFromTweetUrl("") + assert.Error(err) +} + // Should extract a user handle from a shortened tweet URL func TestParseHandleFromShortenedTweetUrl(t *testing.T) { assert := assert.New(t) @@ -211,3 +483,40 @@ func TestGetTinyURLs(t *testing.T) { assert.Equal(u.GetTinyProfileImageUrl(), "https://pbs.twimg.com/profile_images/1208124284/iwRReicO_normal") assert.Equal(u.GetTinyProfileImageLocalPath(), "testUser_profile_iwRReicO_normal.jpg") } + +func TestParseAPIVideo(t *testing.T) { + assert := assert.New(t) + require := require.New(t) + data, err := os.ReadFile("test_responses/tweet_content/video.json") + require.NoError(err) + + var apivideo APIExtendedMedia + err = json.Unmarshal(data, &apivideo) + require.NoError(err) + + video := ParseAPIVideo(apivideo) + assert.Equal(VideoID(1418951950020845568), video.ID) + assert.Equal(1280, video.Height) + assert.Equal(720, video.Width) + assert.Equal("https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/720x1280/sm4iL9_f8Lclh0aa.mp4?tag=12", video.RemoteURL) + assert.Equal("sm/sm4iL9_f8Lclh0aa.mp4", video.LocalFilename) + assert.Equal("https://pbs.twimg.com/ext_tw_video_thumb/1418951950020845568/pu/img/eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailRemoteUrl) + assert.Equal("eU/eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailLocalPath) + assert.Equal(275952, video.ViewCount) + assert.Equal(88300, video.Duration) + assert.False(video.IsDownloaded) +} + +func TestParseGeoblockedVideo(t *testing.T) { + assert := assert.New(t) + require := require.New(t) + data, err := os.ReadFile("test_responses/tweet_content/video_geoblocked.json") + require.NoError(err) + + var apivideo APIExtendedMedia + err = json.Unmarshal(data, &apivideo) + require.NoError(err) + + video := ParseAPIVideo(apivideo) + assert.True(video.IsGeoblocked) +} diff --git a/pkg/scraper/image_test.go b/pkg/scraper/image_test.go deleted file mode 100644 index ce74b31..0000000 --- a/pkg/scraper/image_test.go +++ /dev/null @@ -1,31 +0,0 @@ -package scraper_test - -import ( - "encoding/json" - "os" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - . "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper" -) - -func TestParseAPIMedia(t *testing.T) { - assert := assert.New(t) - data, err := os.ReadFile("test_responses/tweet_content/image.json") - if err != nil { - panic(err) - } - var apimedia APIMedia - err = json.Unmarshal(data, &apimedia) - require.NoError(t, err) - - image := ParseAPIMedia(apimedia) - assert.Equal(ImageID(1395882862289772553), image.ID) - assert.Equal("https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg", image.RemoteURL) - assert.Equal(593, image.Width) - assert.Equal(239, image.Height) - assert.Equal("E1/E18sEUrWYAk8dBl.jpg", image.LocalFilename) - assert.False(image.IsDownloaded) -} diff --git a/pkg/scraper/poll_test.go b/pkg/scraper/poll_test.go deleted file mode 100644 index ee879b7..0000000 --- a/pkg/scraper/poll_test.go +++ /dev/null @@ -1,90 +0,0 @@ -package scraper_test - -import ( - "encoding/json" - "os" - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - . "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper" -) - -func TestParsePoll2Choices(t *testing.T) { - assert := assert.New(t) - data, err := os.ReadFile("test_responses/tweet_content/poll_card_2_options.json") - if err != nil { - panic(err) - } - var apiCard APICard - err = json.Unmarshal(data, &apiCard) - require.NoError(t, err) - - poll := ParseAPIPoll(apiCard) - assert.Equal(PollID(1457419248461131776), poll.ID) - assert.Equal(2, poll.NumChoices) - assert.Equal(60*60*24, poll.VotingDuration) - assert.Equal(int64(1636397201), poll.VotingEndsAt.Unix()) - assert.Equal(int64(1636318755), poll.LastUpdatedAt.Unix()) - - assert.Less(poll.LastUpdatedAt.Unix(), poll.VotingEndsAt.Unix()) - assert.Equal("Yes", poll.Choice1) - assert.Equal("No", poll.Choice2) - assert.Equal(529, poll.Choice1_Votes) - assert.Equal(2182, poll.Choice2_Votes) -} - -func TestParsePoll4Choices(t *testing.T) { - assert := assert.New(t) - data, err := os.ReadFile("test_responses/tweet_content/poll_card_4_options_ended.json") - if err != nil { - panic(err) - } - var apiCard APICard - err = json.Unmarshal(data, &apiCard) - require.NoError(t, err) - - poll := ParseAPIPoll(apiCard) - assert.Equal(PollID(1455611588854140929), poll.ID) - assert.Equal(4, poll.NumChoices) - assert.Equal(60*60*24, poll.VotingDuration) - assert.Equal(int64(1635966221), poll.VotingEndsAt.Unix()) - assert.Equal(int64(1635966226), poll.LastUpdatedAt.Unix()) - assert.Greater(poll.LastUpdatedAt.Unix(), poll.VotingEndsAt.Unix()) - - assert.Equal("Alec Baldwin", poll.Choice1) - assert.Equal(1669, poll.Choice1_Votes) - - assert.Equal("Andew Cuomo", poll.Choice2) - assert.Equal(272, poll.Choice2_Votes) - - assert.Equal("George Floyd", poll.Choice3) - assert.Equal(829, poll.Choice3_Votes) - - assert.Equal("Derek Chauvin", poll.Choice4) - assert.Equal(2397, poll.Choice4_Votes) -} - -func TestPollHelpers(t *testing.T) { - assert := assert.New(t) - p := Poll{ - Choice1_Votes: 1, - Choice2_Votes: 2, - Choice3_Votes: 3, - Choice4_Votes: 4, - VotingEndsAt: Timestamp{Time: time.Now().Add(10 * time.Second)}, - } - assert.Equal(p.TotalVotes(), 10) - assert.Equal(p.VotePercentage(p.Choice3_Votes), 30.0) - - assert.True(p.IsOpen()) - assert.False(p.IsWinner(p.Choice4_Votes)) - - // End the poll - p.VotingEndsAt = Timestamp{Time: time.Now().Add(-10 * time.Second)} - assert.False(p.IsOpen()) - assert.False(p.IsWinner(p.Choice2_Votes)) - assert.True(p.IsWinner(p.Choice4_Votes)) -} diff --git a/pkg/scraper/retweet_test.go b/pkg/scraper/retweet_test.go deleted file mode 100644 index f58f6c5..0000000 --- a/pkg/scraper/retweet_test.go +++ /dev/null @@ -1,38 +0,0 @@ -package scraper_test - -import ( - "encoding/json" - "os" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - . "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper" -) - -func TestParseSingleRetweet(t *testing.T) { - assert := assert.New(t) - require := require.New(t) - data, err := os.ReadFile("test_responses/tweet_that_is_a_retweet.json") - if err != nil { - panic(err) - } - var api_tweet APITweet - err = json.Unmarshal(data, &api_tweet) - require.NoError(err) - - trove, err := api_tweet.ToTweetTrove() - require.NoError(err) - - require.Len(trove.Tweets, 0) - require.Len(trove.Retweets, 1) - - retweet, is_ok := trove.Retweets[TweetID(1404270043018448896)] - require.True(is_ok) - - assert.Equal(TweetID(1404270043018448896), retweet.RetweetID) - assert.Equal(TweetID(1404269989646028804), retweet.TweetID) - assert.Equal(UserID(44067298), retweet.RetweetedByID) - assert.Equal(int64(1623639042), retweet.RetweetedAt.Unix()) -} diff --git a/pkg/scraper/test_responses/api_v2/feed_replying_tweet.json b/pkg/scraper/test_responses/api_v2/feed_replying_tweet.json deleted file mode 100644 index da7cdd4..0000000 --- a/pkg/scraper/test_responses/api_v2/feed_replying_tweet.json +++ /dev/null @@ -1 +0,0 @@ -{"data":{"user":{"result":{"__typename":"User","timeline":{"timeline":{"instructions":[{"type":"TimelineAddEntries","entries":[{"entryId":"tweet-1485374962563264513","sortIndex":"1485374962563264513","content":{"entryType":"TimelineTimelineItem","itemContent":{"itemType":"TimelineTweet","tweet_results":{"result":{"__typename":"Tweet","rest_id":"1485374962563264513","core":{"user_results":{"result":{"__typename":"User","id":"VXNlcjo0NDA2NzI5OA==","rest_id":"44067298","affiliates_highlighted_label":{},"has_nft_avatar":false,"legacy":{"created_at":"Tue Jun 02 05:35:52 +0000 2009","default_profile":false,"default_profile_image":false,"description":"Author of Dear Reader, The New Right & The Anarchist Handbook\nHost of \"YOUR WELCOME\" \nSubject of Ego & Hubris by Harvey Pekar\nHe/Him ⚑\n@SheathUnderwear Model","entities":{"description":{"urls":[]},"url":{"urls":[{"display_url":"amzn.to/3oInafv","expanded_url":"https://amzn.to/3oInafv","url":"https://t.co/7VDFOOtFK2","indices":[0,23]}]}},"fast_followers_count":0,"favourites_count":3840,"followers_count":334571,"friends_count":964,"has_custom_timelines":false,"is_translator":false,"listed_count":1434,"location":"Austin","media_count":9504,"name":"Michael Malice","normal_followers_count":334571,"pinned_tweet_ids_str":["1477347403023982596"],"profile_banner_extensions":{"mediaColor":{"r":{"ok":{"palette":[{"percentage":60.59,"rgb":{"blue":0,"green":0,"red":0}},{"percentage":18.77,"rgb":{"blue":64,"green":60,"red":156}},{"percentage":3.62,"rgb":{"blue":31,"green":29,"red":77}},{"percentage":3.22,"rgb":{"blue":215,"green":199,"red":138}},{"percentage":2.83,"rgb":{"blue":85,"green":79,"red":215}}]}}}},"profile_banner_url":"https://pbs.twimg.com/profile_banners/44067298/1615134676","profile_image_extensions":{"mediaColor":{"r":{"ok":{"palette":[{"percentage":50.78,"rgb":{"blue":249,"green":247,"red":246}},{"percentage":17.4,"rgb":{"blue":51,"green":51,"red":205}},{"percentage":9.43,"rgb":{"blue":124,"green":139,"red":210}},{"percentage":6.38,"rgb":{"blue":47,"green":63,"red":116}},{"percentage":3.17,"rgb":{"blue":65,"green":45,"red":46}}]}}}},"profile_image_url_https":"https://pbs.twimg.com/profile_images/1415820415314931715/_VVX4GI8_normal.jpg","profile_interstitial_type":"","protected":false,"screen_name":"michaelmalice","statuses_count":138682,"translator_type":"none","url":"https://t.co/7VDFOOtFK2","verified":true,"withheld_in_countries":[]},"super_follow_eligible":false,"super_followed_by":false,"super_following":false}}},"legacy":{"created_at":"Sun Jan 23 22:12:42 +0000 2022","conversation_id_str":"1485374031821946888","display_text_range":[14,37],"entities":{"user_mentions":[{"id_str":"393067691","name":"Why Is Everything Stupid?","screen_name":"KyleSouther1","indices":[0,13]}],"urls":[],"hashtags":[],"symbols":[]},"favorite_count":440,"favorited":false,"full_text":"@KyleSouther1 thats a really good one","in_reply_to_screen_name":"KyleSouther1","in_reply_to_status_id_str":"1485374790932201472","in_reply_to_user_id_str":"393067691","is_quote_status":false,"lang":"en","quote_count":0,"reply_count":8,"retweet_count":0,"retweeted":false,"source":"Twitter Web App","user_id_str":"44067298","id_str":"1485374962563264513"}}},"tweetDisplayType":"Tweet","ruxContext":"HHwWgsC9nZOqjp0pAAAA"}}}]},{"type":"TimelinePinEntry","entry":{"entryId":"tweet-1477347403023982596","sortIndex":"1485710641850220544","content":{"entryType":"TimelineTimelineItem","itemContent":{"itemType":"TimelineTweet","tweet_results":{"result":{"__typename":"Tweet","rest_id":"1477347403023982596","core":{"user_results":{"result":{"__typename":"User","id":"VXNlcjo0NDA2NzI5OA==","rest_id":"44067298","affiliates_highlighted_label":{},"has_nft_avatar":false,"legacy":{"created_at":"Tue Jun 02 05:35:52 +0000 2009","default_profile":false,"default_profile_image":false,"description":"Author of Dear Reader, The New Right & The Anarchist Handbook\nHost of \"YOUR WELCOME\" \nSubject of Ego & Hubris by Harvey Pekar\nHe/Him ⚑\n@SheathUnderwear Model","entities":{"description":{"urls":[]},"url":{"urls":[{"display_url":"amzn.to/3oInafv","expanded_url":"https://amzn.to/3oInafv","url":"https://t.co/7VDFOOtFK2","indices":[0,23]}]}},"fast_followers_count":0,"favourites_count":3840,"followers_count":334571,"friends_count":964,"has_custom_timelines":false,"is_translator":false,"listed_count":1434,"location":"Austin","media_count":9504,"name":"Michael Malice","normal_followers_count":334571,"pinned_tweet_ids_str":["1477347403023982596"],"profile_banner_extensions":{"mediaColor":{"r":{"ok":{"palette":[{"percentage":60.59,"rgb":{"blue":0,"green":0,"red":0}},{"percentage":18.77,"rgb":{"blue":64,"green":60,"red":156}},{"percentage":3.62,"rgb":{"blue":31,"green":29,"red":77}},{"percentage":3.22,"rgb":{"blue":215,"green":199,"red":138}},{"percentage":2.83,"rgb":{"blue":85,"green":79,"red":215}}]}}}},"profile_banner_url":"https://pbs.twimg.com/profile_banners/44067298/1615134676","profile_image_extensions":{"mediaColor":{"r":{"ok":{"palette":[{"percentage":50.78,"rgb":{"blue":249,"green":247,"red":246}},{"percentage":17.4,"rgb":{"blue":51,"green":51,"red":205}},{"percentage":9.43,"rgb":{"blue":124,"green":139,"red":210}},{"percentage":6.38,"rgb":{"blue":47,"green":63,"red":116}},{"percentage":3.17,"rgb":{"blue":65,"green":45,"red":46}}]}}}},"profile_image_url_https":"https://pbs.twimg.com/profile_images/1415820415314931715/_VVX4GI8_normal.jpg","profile_interstitial_type":"","protected":false,"screen_name":"michaelmalice","statuses_count":138682,"translator_type":"none","url":"https://t.co/7VDFOOtFK2","verified":true,"withheld_in_countries":[]},"super_follow_eligible":false,"super_followed_by":false,"super_following":false}}},"legacy":{"created_at":"Sat Jan 01 18:34:03 +0000 2022","conversation_id_str":"1469757271341256705","display_text_range":[0,76],"entities":{"media":[{"display_url":"pic.twitter.com/ra6b4YhsDI","expanded_url":"https://twitter.com/michaelmalice/status/1477347403023982596/photo/1","id_str":"1477347346367369228","indices":[77,100],"media_url_https":"https://pbs.twimg.com/media/FICXlY4X0AwyB2m.jpg","type":"photo","url":"https://t.co/ra6b4YhsDI","features":{"large":{"faces":[]},"medium":{"faces":[]},"small":{"faces":[]},"orig":{"faces":[]}},"sizes":{"large":{"h":900,"w":1200,"resize":"fit"},"medium":{"h":900,"w":1200,"resize":"fit"},"small":{"h":510,"w":680,"resize":"fit"},"thumb":{"h":150,"w":150,"resize":"crop"}},"original_info":{"height":900,"width":1200,"focus_rects":[{"x":0,"y":0,"w":1200,"h":672},{"x":0,"y":0,"w":900,"h":900},{"x":0,"y":0,"w":789,"h":900},{"x":105,"y":0,"w":450,"h":900},{"x":0,"y":0,"w":1200,"h":900}]}}],"user_mentions":[],"urls":[{"display_url":"AnarchistAudiobook.com","expanded_url":"http://AnarchistAudiobook.com","url":"https://t.co/6lCDVUPz05","indices":[53,76]}],"hashtags":[],"symbols":[]},"extended_entities":{"media":[{"display_url":"pic.twitter.com/ra6b4YhsDI","expanded_url":"https://twitter.com/michaelmalice/status/1477347403023982596/photo/1","id_str":"1477347346367369228","indices":[77,100],"media_key":"3_1477347346367369228","media_url_https":"https://pbs.twimg.com/media/FICXlY4X0AwyB2m.jpg","type":"photo","url":"https://t.co/ra6b4YhsDI","ext_media_color":{"palette":[{"percentage":47.83,"rgb":{"blue":13,"green":13,"red":21}},{"percentage":14.09,"rgb":{"blue":56,"green":88,"red":126}},{"percentage":6.88,"rgb":{"blue":183,"green":223,"red":252}},{"percentage":5.64,"rgb":{"blue":109,"green":69,"red":240}},{"percentage":4.09,"rgb":{"blue":47,"green":21,"red":34}}]},"ext_media_availability":{"status":"Available"},"features":{"large":{"faces":[]},"medium":{"faces":[]},"small":{"faces":[]},"orig":{"faces":[]}},"sizes":{"large":{"h":900,"w":1200,"resize":"fit"},"medium":{"h":900,"w":1200,"resize":"fit"},"small":{"h":510,"w":680,"resize":"fit"},"thumb":{"h":150,"w":150,"resize":"crop"}},"original_info":{"height":900,"width":1200,"focus_rects":[{"x":0,"y":0,"w":1200,"h":672},{"x":0,"y":0,"w":900,"h":900},{"x":0,"y":0,"w":789,"h":900},{"x":105,"y":0,"w":450,"h":900},{"x":0,"y":0,"w":1200,"h":900}]}}]},"favorite_count":856,"favorited":false,"full_text":"THE GRAND FINALE\n\nAn all-star cast, available now at https://t.co/6lCDVUPz05 https://t.co/ra6b4YhsDI","in_reply_to_screen_name":"michaelmalice","in_reply_to_status_id_str":"1476988732880347140","in_reply_to_user_id_str":"44067298","is_quote_status":false,"lang":"en","possibly_sensitive":false,"possibly_sensitive_editable":true,"quote_count":8,"reply_count":63,"retweet_count":62,"retweeted":false,"source":"Twitter Web App","user_id_str":"44067298","id_str":"1477347403023982596","self_thread":{"id_str":"1469757271341256705"}}}},"tweetDisplayType":"Tweet","ruxContext":"HHwWiIC58dfoy4ApAAAA","socialContext":{"type":"TimelineGeneralContext","contextType":"Pin","text":"Pinned Tweet"}},"clientEventInfo":{"component":"suggest_pinned_tweet","details":{"timelinesDetails":{"injectionType":"PinnedTweet"}}}}}}]}}}}}} diff --git a/pkg/scraper/url_test.go b/pkg/scraper/url_test.go deleted file mode 100644 index 3ed3300..0000000 --- a/pkg/scraper/url_test.go +++ /dev/null @@ -1,165 +0,0 @@ -package scraper_test - -import ( - "encoding/json" - "os" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - . "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper" -) - -func TestParseAPIUrlCard(t *testing.T) { - assert := assert.New(t) - data, err := os.ReadFile("test_responses/tweet_content/url_card.json") - if err != nil { - panic(err) - } - var apiCard APICard - err = json.Unmarshal(data, &apiCard) - require.NoError(t, err) - - url := ParseAPIUrlCard(apiCard) - assert.Equal("reason.com", url.Domain) - assert.Equal("L.A. Teachers Union Leader: 'There's No Such Thing As Learning Loss'", url.Title) - assert.Equal("\"It’s OK that our babies may not have learned all their times tables,\" says Cecily Myart-Cruz. \"They learned "+ - "resilience.\"", url.Description) - assert.Equal(600, url.ThumbnailWidth) - assert.Equal(315, url.ThumbnailHeight) - assert.Equal("https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x600", url.ThumbnailRemoteUrl) - assert.Equal("od/odDi9EqO_600x600.jpg", url.ThumbnailLocalPath) - assert.Equal(UserID(155581583), url.CreatorID) - assert.Equal(UserID(16467567), url.SiteID) - assert.True(url.HasThumbnail) - assert.False(url.IsContentDownloaded) -} - -func TestParseAPIUrlCardWithPlayer(t *testing.T) { - assert := assert.New(t) - data, err := os.ReadFile("test_responses/tweet_content/url_card_with_player.json") - if err != nil { - panic(err) - } - var apiCard APICard - err = json.Unmarshal(data, &apiCard) - require.NoError(t, err) - - url := ParseAPIUrlCard(apiCard) - assert.Equal("www.youtube.com", url.Domain) - assert.Equal("The Politically Incorrect Guide to the Constitution (Starring Tom...", url.Title) - assert.Equal("Watch this episode on LBRY/Odysee: https://odysee.com/@capitalresearch:5/the-politically-incorrect-guide-to-the:8"+ - "Watch this episode on Rumble: https://rumble...", url.Description) - assert.Equal("https://pbs.twimg.com/card_img/1437849456423194639/_1t0btyt?format=jpg&name=800x320_1", url.ThumbnailRemoteUrl) - assert.Equal("_1/_1t0btyt_800x320_1.jpg", url.ThumbnailLocalPath) - assert.Equal(UserID(10228272), url.SiteID) - assert.True(url.HasThumbnail) - assert.False(url.IsContentDownloaded) -} - -func TestParseAPIUrlCardWithPlayerAndPlaceholderThumbnail(t *testing.T) { - assert := assert.New(t) - data, err := os.ReadFile("test_responses/tweet_content/url_card_with_player_placeholder_image.json") - if err != nil { - panic(err) - } - var apiCard APICard - err = json.Unmarshal(data, &apiCard) - require.NoError(t, err) - - url := ParseAPIUrlCard(apiCard) - assert.Equal("www.youtube.com", url.Domain) - assert.Equal("Did Michael Malice Turn Me into an Anarchist? | Ep 181", url.Title) - assert.Equal("SUBSCRIBE TO THE NEW SHOW W/ ELIJAH & SYDNEY: \"YOU ARE HERE\"YT: https://www.youtube.com/youareheredaily____________"+ - "__________________________________________...", url.Description) - assert.Equal("https://pbs.twimg.com/cards/player-placeholder.png", url.ThumbnailRemoteUrl) - assert.Equal("player-placeholder.png", url.ThumbnailLocalPath) - assert.Equal(UserID(10228272), url.SiteID) - assert.True(url.HasThumbnail) - assert.False(url.IsContentDownloaded) -} - -func TestParseAPIUrlCardWithoutThumbnail(t *testing.T) { - assert := assert.New(t) - data, err := os.ReadFile("test_responses/tweet_content/url_card_without_thumbnail.json") - if err != nil { - panic(err) - } - var apiCard APICard - err = json.Unmarshal(data, &apiCard) - require.NoError(t, err) - - url := ParseAPIUrlCard(apiCard) - assert.Equal("en.m.wikipedia.org", url.Domain) - assert.Equal("Entryism - Wikipedia", url.Title) - assert.Equal("", url.Description) - assert.True(url.HasCard) - assert.False(url.HasThumbnail) -} - -/** - * Should check if a url is a tweet url, and if so, parse it - */ -func TestParseTweetUrl(t *testing.T) { - assert := assert.New(t) - - // Test valid tweet url - url := "https://twitter.com/kanesays23/status/1429583672827465730" - handle, id, is_ok := TryParseTweetUrl(url) - assert.True(is_ok) - assert.Equal(UserHandle("kanesays23"), handle) - assert.Equal(TweetID(1429583672827465730), id) - - // Test url with GET params - handle, id, is_ok = TryParseTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720?s=20") - assert.True(is_ok) - assert.Equal(UserHandle("NerdNoticing"), handle) - assert.Equal(TweetID(1263192389050654720), id) - - // Test a `mobile.twitter.com` url - handle, id, is_ok = TryParseTweetUrl("https://mobile.twitter.com/APhilosophae/status/1497720548540964864") - assert.True(is_ok) - assert.Equal(UserHandle("APhilosophae"), handle) - assert.Equal(TweetID(1497720548540964864), id) - - // Test a `x.com` url - handle, id, is_ok = TryParseTweetUrl("https://x.com/brutedeforce/status/1579695139425222657?s=46") - assert.True(is_ok) - assert.Equal(UserHandle("brutedeforce"), handle) - assert.Equal(TweetID(1579695139425222657), id) - - // Test invalid url - _, _, is_ok = TryParseTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720s=20") - assert.False(is_ok) - - // Test empty string - _, _, is_ok = TryParseTweetUrl("") - assert.False(is_ok) -} - -/** - * Should extract a user handle from a tweet URL, or fail if URL is invalid - */ -func TestParseHandleFromTweetUrl(t *testing.T) { - assert := assert.New(t) - - // Test valid tweet url - url := "https://twitter.com/kanesays23/status/1429583672827465730" - result, err := ParseHandleFromTweetUrl(url) - assert.NoError(err) - assert.Equal(UserHandle("kanesays23"), result) - - // Test url with GET params - result, err = ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720?s=20") - assert.NoError(err) - assert.Equal(UserHandle("NerdNoticing"), result) - - // Test invalid url - _, err = ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720s=20") - assert.Error(err) - - // Test empty string - _, err = ParseHandleFromTweetUrl("") - assert.Error(err) -} diff --git a/pkg/scraper/video_test.go b/pkg/scraper/video_test.go deleted file mode 100644 index d042442..0000000 --- a/pkg/scraper/video_test.go +++ /dev/null @@ -1,49 +0,0 @@ -package scraper_test - -import ( - "encoding/json" - "os" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - . "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper" -) - -func TestParseAPIVideo(t *testing.T) { - assert := assert.New(t) - require := require.New(t) - data, err := os.ReadFile("test_responses/tweet_content/video.json") - require.NoError(err) - - var apivideo APIExtendedMedia - err = json.Unmarshal(data, &apivideo) - require.NoError(err) - - video := ParseAPIVideo(apivideo) - assert.Equal(VideoID(1418951950020845568), video.ID) - assert.Equal(1280, video.Height) - assert.Equal(720, video.Width) - assert.Equal("https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/720x1280/sm4iL9_f8Lclh0aa.mp4?tag=12", video.RemoteURL) - assert.Equal("sm/sm4iL9_f8Lclh0aa.mp4", video.LocalFilename) - assert.Equal("https://pbs.twimg.com/ext_tw_video_thumb/1418951950020845568/pu/img/eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailRemoteUrl) - assert.Equal("eU/eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailLocalPath) - assert.Equal(275952, video.ViewCount) - assert.Equal(88300, video.Duration) - assert.False(video.IsDownloaded) -} - -func TestParseGeoblockedVideo(t *testing.T) { - assert := assert.New(t) - require := require.New(t) - data, err := os.ReadFile("test_responses/tweet_content/video_geoblocked.json") - require.NoError(err) - - var apivideo APIExtendedMedia - err = json.Unmarshal(data, &apivideo) - require.NoError(err) - - video := ParseAPIVideo(apivideo) - assert.True(video.IsGeoblocked) -}