From f22f15f3d959ce1d3f92d23a8c1430fc2a072262 Mon Sep 17 00:00:00 2001 From: Alessio Date: Wed, 2 Mar 2022 14:34:42 -0800 Subject: [PATCH] Prevent quoted-tweet URLs from being added as URLs (they should just be quote-tweets) --- scraper/tweet.go | 12 ++++++--- scraper/tweet_test.go | 19 ++++++++++++++ scraper/url.go | 58 ++++++++++++++++++++++++++++++++++--------- scraper/url_test.go | 54 ++++++++++++++++++++++++++++++++++++++++ scraper/user.go | 16 ------------ scraper/user_test.go | 27 -------------------- 6 files changed, 128 insertions(+), 58 deletions(-) diff --git a/scraper/tweet.go b/scraper/tweet.go index 4f1e11d..9b594c8 100644 --- a/scraper/tweet.go +++ b/scraper/tweet.go @@ -102,6 +102,7 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) { ret.NumReplies = apiTweet.ReplyCount ret.NumQuoteTweets = apiTweet.QuoteCount ret.InReplyToID = TweetID(apiTweet.InReplyToStatusID) + ret.QuotedTweetID = TweetID(apiTweet.QuotedStatusID) // Process URLs and link previews for _, url := range apiTweet.Entities.URLs { @@ -112,14 +113,20 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) { url_object.Text = url.ExpandedURL url_object.ShortText = url.ShortenedUrl url_object.TweetID = ret.ID + + // Skip it if it's just the quoted tweet + _, id, is_ok := TryParseTweetUrl(url.ExpandedURL) + if is_ok && id == ret.QuotedTweetID { + continue + } + ret.Urls = append(ret.Urls, url_object) } // Process images for _, media := range apiTweet.Entities.Media { if media.Type != "photo" { // TODO: remove this eventually - panic_str := fmt.Sprintf("Unknown media type: %q", media.Type) - panic(panic_str) + panic(fmt.Sprintf("Unknown media type: %q", media.Type)) } new_image := ParseAPIMedia(media) new_image.TweetID = ret.ID @@ -144,7 +151,6 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) { } } - ret.QuotedTweetID = TweetID(apiTweet.QuotedStatusID) // Process videos for _, entity := range apiTweet.ExtendedEntities.Media { diff --git 
a/scraper/tweet_test.go b/scraper/tweet_test.go index 3476160..13cb63c 100644 --- a/scraper/tweet_test.go +++ b/scraper/tweet_test.go @@ -50,6 +50,9 @@ func TestParseTweetWithImage(t *testing.T) { assert.Len(tweet.Images, 1) } +/** + * Ensure the fake url (link to the quoted tweet) is not parsed as a URL; it should just be ignored + */ func TestParseTweetWithQuotedTweetAsLink(t *testing.T) { assert := assert.New(t) tweet := load_tweet_from_file("test_responses/single_tweets/tweet_with_quoted_tweet_as_link2.json") @@ -59,6 +62,22 @@ func TestParseTweetWithQuotedTweetAsLink(t *testing.T) { assert.Equal(TweetID(1396194494710788100), tweet.QuotedTweetID) assert.Empty(tweet.ReplyMentions) assert.Empty(tweet.Polls) + assert.Empty(tweet.Urls) +} + +/** + * Quote-tweets with links should work properly + */ +func TestParseTweetWithQuotedTweetAndLink(t *testing.T) { + assert := assert.New(t) + tweet := load_tweet_from_file("test_responses/single_tweets/tweet_with_quoted_tweet_and_url.json") + + assert.Equal("This is video he’s talking about. Please watch. 
Is there a single US politician capable of doing this with the weasels and rats running American industry today?", tweet.Text) + assert.Equal(TweetID(1497997890999898115), tweet.QuotedTweetID) + + assert.Len(tweet.Urls, 1) + url := tweet.Urls[0] + assert.Equal(url.Text, "https://youtu.be/VjrlTMvirVo") } func TestParseTweetWithVideo(t *testing.T) { diff --git a/scraper/url.go b/scraper/url.go index b812cfe..71b3d65 100644 --- a/scraper/url.go +++ b/scraper/url.go @@ -3,6 +3,7 @@ package scraper import ( "fmt" "path" + "regexp" "net/url" ) @@ -60,17 +61,50 @@ func ParseAPIUrlCard(apiCard APICard) Url { } func get_thumbnail_local_path(remote_url string) string { - u, err := url.Parse(remote_url) - if err != nil { - panic(err) - } - if u.RawQuery == "" { - return path.Base(u.Path) - } - query_params, err := url.ParseQuery(u.RawQuery) - if err != nil { - panic(err) - } + u, err := url.Parse(remote_url) + if err != nil { + panic(err) + } + if u.RawQuery == "" { + return path.Base(u.Path) + } + query_params, err := url.ParseQuery(u.RawQuery) + if err != nil { + panic(err) + } - return fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0]) + return fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0]) +} + +/** + * Given a URL, try to parse it as a tweet url. + * The bool is an `is_ok` value; true if the parse was successful, false if it didn't match + */ +func TryParseTweetUrl(url string) (UserHandle, TweetID, bool) { + r := regexp.MustCompile(`^https://twitter.com/(\w+)/status/(\d+)(?:\?.*)?$`) + matches := r.FindStringSubmatch(url) + if matches == nil { + return UserHandle(""), TweetID(0), false + } + if len(matches) != 3 { // matches[0] is the full string + panic(matches) + } + return UserHandle(matches[1]), TweetID(int_or_panic(matches[2])), true +} + +/** + * Given a tweet URL, return the corresponding user handle. + * If tweet url is not valid, return an error. 
+ */ +func ParseHandleFromTweetUrl(tweet_url string) (UserHandle, error) { + short_url_regex := regexp.MustCompile(`^https://t.co/\w{5,20}$`) + if short_url_regex.MatchString(tweet_url) { + tweet_url = ExpandShortUrl(tweet_url) + } + + ret, _, is_ok := TryParseTweetUrl(tweet_url) + if !is_ok { + return "", fmt.Errorf("Invalid tweet url: %s", tweet_url) + } + return ret, nil } diff --git a/scraper/url_test.go b/scraper/url_test.go index 18f3f48..f85e6c1 100644 --- a/scraper/url_test.go +++ b/scraper/url_test.go @@ -94,3 +94,57 @@ func TestParseAPIUrlCardWithoutThumbnail(t *testing.T) { assert.True(url.HasCard) assert.False(url.HasThumbnail) } + +/** + * Should check if a url is a tweet url, and if so, parse it + */ +func TestParseTweetUrl(t *testing.T) { + assert:= assert.New(t) + + // Test valid tweet url + url := "https://twitter.com/kanesays23/status/1429583672827465730" + handle, id, is_ok := TryParseTweetUrl(url) + assert.True(is_ok) + assert.Equal(UserHandle("kanesays23"), handle) + assert.Equal(TweetID(1429583672827465730), id) + + // Test url with GET params + handle, id, is_ok = TryParseTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720?s=20") + assert.True(is_ok) + assert.Equal(UserHandle("NerdNoticing"), handle) + assert.Equal(TweetID(1263192389050654720), id) + + // Test invalid url + _, _, is_ok = TryParseTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720s=20") + assert.False(is_ok) + + // Test empty string + _, _, is_ok = TryParseTweetUrl("") + assert.False(is_ok) +} + +/** + * Should extract a user handle from a tweet URL, or fail if URL is invalid + */ +func TestParseHandleFromTweetUrl(t *testing.T) { + assert := assert.New(t) + + // Test valid tweet url + url := "https://twitter.com/kanesays23/status/1429583672827465730" + result, err := ParseHandleFromTweetUrl(url) + assert.NoError(err) + assert.Equal(UserHandle("kanesays23"), result) + + // Test url with GET params + result, err = 
ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720?s=20") + assert.NoError(err) + assert.Equal(UserHandle("NerdNoticing"), result) + + // Test invalid url + _, err = ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720s=20") + assert.Error(err) + + // Test empty string + _, err = ParseHandleFromTweetUrl("") + assert.Error(err) +} diff --git a/scraper/user.go b/scraper/user.go index 61a1284..ba8d0cb 100644 --- a/scraper/user.go +++ b/scraper/user.go @@ -84,23 +84,7 @@ Joined %s return ret } -/** - * Given a tweet URL, return the corresponding user handle. - * If tweet url is not valid, return an error. - */ -func ParseHandleFromTweetUrl(tweet_url string) (UserHandle, error) { - short_url_regex := regexp.MustCompile(`^https://t.co/\w{5,20}$`) - if short_url_regex.MatchString(tweet_url) { - tweet_url = ExpandShortUrl(tweet_url) - } - r := regexp.MustCompile(`^https://twitter.com/(\w+)/status/\d+(?:\?.*)?$`) - matches := r.FindStringSubmatch(tweet_url) - if len(matches) != 2 { // matches[0] is the full string - return "", fmt.Errorf("Invalid tweet url: %s", tweet_url) - } - return UserHandle(matches[1]), nil -} /** * Unknown Users with handles are only created by direct GetUser calls (either `twitter fetch_user` diff --git a/scraper/user_test.go b/scraper/user_test.go index e00bee2..166bd3d 100644 --- a/scraper/user_test.go +++ b/scraper/user_test.go @@ -104,33 +104,6 @@ func TestParseDeletedUser(t *testing.T) { assert.Equal("default_profile.png", user.GetTinyProfileImageLocalPath()) } -/** - * Should extract a user handle from a tweet URL, or fail if URL is invalid - */ -func TestParseHandleFromTweetUrl(t *testing.T) { - assert := assert.New(t) - - // Test valid tweet url - url := "https://twitter.com/kanesays23/status/1429583672827465730" - result, err := ParseHandleFromTweetUrl(url) - assert.NoError(err) - assert.Equal(UserHandle("kanesays23"), result) - - // Test url with GET params - result, err = 
ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720?s=20") - assert.NoError(err) - assert.Equal(UserHandle("NerdNoticing"), result) - - // Test invalid url - _, err = ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720s=20") - assert.Error(err) - - // Test empty string - _, err = ParseHandleFromTweetUrl("") - assert.Error(err) -} - - /** * Should extract a user handle from a shortened tweet URL */