Prevent quoted-tweet URLs from being added as URLs (they should just be quote-tweets)

This commit is contained in:
Alessio 2022-03-02 14:34:42 -08:00
parent aa961b9ff4
commit f22f15f3d9
6 changed files with 128 additions and 58 deletions

View File

@ -102,6 +102,7 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
ret.NumReplies = apiTweet.ReplyCount ret.NumReplies = apiTweet.ReplyCount
ret.NumQuoteTweets = apiTweet.QuoteCount ret.NumQuoteTweets = apiTweet.QuoteCount
ret.InReplyToID = TweetID(apiTweet.InReplyToStatusID) ret.InReplyToID = TweetID(apiTweet.InReplyToStatusID)
ret.QuotedTweetID = TweetID(apiTweet.QuotedStatusID)
// Process URLs and link previews // Process URLs and link previews
for _, url := range apiTweet.Entities.URLs { for _, url := range apiTweet.Entities.URLs {
@ -112,14 +113,20 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
url_object.Text = url.ExpandedURL url_object.Text = url.ExpandedURL
url_object.ShortText = url.ShortenedUrl url_object.ShortText = url.ShortenedUrl
url_object.TweetID = ret.ID url_object.TweetID = ret.ID
// Skip it if it's just the quoted tweet
_, id, is_ok := TryParseTweetUrl(url.ExpandedURL)
if is_ok && id == ret.QuotedTweetID {
continue
}
ret.Urls = append(ret.Urls, url_object) ret.Urls = append(ret.Urls, url_object)
} }
// Process images // Process images
for _, media := range apiTweet.Entities.Media { for _, media := range apiTweet.Entities.Media {
if media.Type != "photo" { // TODO: remove this eventually if media.Type != "photo" { // TODO: remove this eventually
panic_str := fmt.Sprintf("Unknown media type: %q", media.Type) panic(fmt.Sprintf("Unknown media type: %q", media.Type))
panic(panic_str)
} }
new_image := ParseAPIMedia(media) new_image := ParseAPIMedia(media)
new_image.TweetID = ret.ID new_image.TweetID = ret.ID
@ -144,7 +151,6 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
} }
} }
ret.QuotedTweetID = TweetID(apiTweet.QuotedStatusID)
// Process videos // Process videos
for _, entity := range apiTweet.ExtendedEntities.Media { for _, entity := range apiTweet.ExtendedEntities.Media {

View File

@ -50,6 +50,9 @@ func TestParseTweetWithImage(t *testing.T) {
assert.Len(tweet.Images, 1) assert.Len(tweet.Images, 1)
} }
/**
* Ensure the fake url (link to the quoted tweet) is not parsed as a URL; it should just be ignored
*/
func TestParseTweetWithQuotedTweetAsLink(t *testing.T) { func TestParseTweetWithQuotedTweetAsLink(t *testing.T) {
assert := assert.New(t) assert := assert.New(t)
tweet := load_tweet_from_file("test_responses/single_tweets/tweet_with_quoted_tweet_as_link2.json") tweet := load_tweet_from_file("test_responses/single_tweets/tweet_with_quoted_tweet_as_link2.json")
@ -59,6 +62,22 @@ func TestParseTweetWithQuotedTweetAsLink(t *testing.T) {
assert.Equal(TweetID(1396194494710788100), tweet.QuotedTweetID) assert.Equal(TweetID(1396194494710788100), tweet.QuotedTweetID)
assert.Empty(tweet.ReplyMentions) assert.Empty(tweet.ReplyMentions)
assert.Empty(tweet.Polls) assert.Empty(tweet.Polls)
assert.Empty(tweet.Urls)
}
/**
* Quote-tweets with links should work properly: the quoted tweet is recorded in `QuotedTweetID`,
* and the tweet's other link is still present in `Urls`
*/
func TestParseTweetWithQuotedTweetAndLink(t *testing.T) {
assert := assert.New(t)
tweet := load_tweet_from_file("test_responses/single_tweets/tweet_with_quoted_tweet_and_url.json")
assert.Equal("This is video hes talking about. Please watch. Is there a single US politician capable of doing this with the weasels and rats running American industry today?", tweet.Text)
assert.Equal(TweetID(1497997890999898115), tweet.QuotedTweetID)
// Exactly one URL is expected on the parsed tweet
assert.Len(tweet.Urls, 1)
url := tweet.Urls[0]
// NOTE(review): arguments here are in (actual, expected) order, unlike the other `assert.Equal`
// calls in this test, which pass the expected value first -- consider swapping them
assert.Equal(url.Text, "https://youtu.be/VjrlTMvirVo")
} }
func TestParseTweetWithVideo(t *testing.T) { func TestParseTweetWithVideo(t *testing.T) {

View File

@ -3,6 +3,7 @@ package scraper
import ( import (
"fmt" "fmt"
"path" "path"
"regexp"
"net/url" "net/url"
) )
@ -60,17 +61,50 @@ func ParseAPIUrlCard(apiCard APICard) Url {
} }
/**
* Compute the local filename for a thumbnail from its remote URL.
* - If the URL has no query string, the result is the last element of the URL path
* - Otherwise the result is `<last path element>_<name>.<format>`, built from the first value of
*   the `name` and `format` query params
* Panics if the URL or its query string can't be parsed.
* NOTE(review): the `[0]` indexing will also panic if either query param is absent -- confirm that
* every URL with a query string carries both
*/
func get_thumbnail_local_path(remote_url string) string { func get_thumbnail_local_path(remote_url string) string {
u, err := url.Parse(remote_url) u, err := url.Parse(remote_url)
if err != nil { if err != nil {
panic(err) panic(err)
} }
if u.RawQuery == "" { if u.RawQuery == "" {
return path.Base(u.Path) return path.Base(u.Path)
} }
query_params, err := url.ParseQuery(u.RawQuery) query_params, err := url.ParseQuery(u.RawQuery)
if err != nil { if err != nil {
panic(err) panic(err)
} }
return fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0]) return fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0])
}
/**
 * Matches a tweet URL, capturing the user handle and the tweet ID.
 * Compiled once at package init instead of on every call.  The dots in the hostname are escaped so
 * that only a literal `twitter.com` matches (an unescaped `.` matches any character).
 */
var tweet_url_regex = regexp.MustCompile(`^https://twitter\.com/(\w+)/status/(\d+)(?:\?.*)?$`)

/**
 * Given a URL, try to parse it as a tweet url.
 *
 * Returns the user handle and the tweet ID found in the URL.
 * The bool is an `is_ok` value; true if the parse was successful, false if it didn't match (in which
 * case the handle and ID are zero values).
 */
func TryParseTweetUrl(url string) (UserHandle, TweetID, bool) {
	matches := tweet_url_regex.FindStringSubmatch(url)
	if matches == nil {
		return UserHandle(""), TweetID(0), false
	}

	// A non-nil result always holds the full match plus one entry per capture group, so
	// matches[1] (handle) and matches[2] (ID) are always present here.
	// NOTE(review): `int_or_panic` is defined elsewhere; presumably it panics when the digits don't
	// fit in an integer -- confirm whether over-long IDs should yield `false` instead.
	return UserHandle(matches[1]), TweetID(int_or_panic(matches[2])), true
}
/**
* Given a tweet URL, return the corresponding user handle.
* URLs matching the `t.co` short-link pattern are first expanded with `ExpandShortUrl`; the result
* is then parsed with `TryParseTweetUrl`, and the tweet ID it returns is discarded.
* If tweet url is not valid, return an error.
*/
func ParseHandleFromTweetUrl(tweet_url string) (UserHandle, error) {
// NOTE(review): this pattern is recompiled on every call, and its `.` is unescaped so it matches
// any character, not just a literal dot
short_url_regex := regexp.MustCompile(`^https://t.co/\w{5,20}$`)
if short_url_regex.MatchString(tweet_url) {
tweet_url = ExpandShortUrl(tweet_url)
}
// Only the handle is needed here; the parsed tweet ID is ignored
ret, _, is_ok := TryParseTweetUrl(tweet_url)
if !is_ok {
return "", fmt.Errorf("Invalid tweet url: %s", tweet_url)
}
return ret, nil
} }

View File

@ -94,3 +94,57 @@ func TestParseAPIUrlCardWithoutThumbnail(t *testing.T) {
assert.True(url.HasCard) assert.True(url.HasCard)
assert.False(url.HasThumbnail) assert.False(url.HasThumbnail)
} }
/**
 * `TryParseTweetUrl` should accept tweet urls (with or without GET params), returning their handle
 * and ID, and should reject anything else
 */
func TestParseTweetUrl(t *testing.T) {
	assert := assert.New(t)

	// Valid tweet urls; the second one carries GET params
	valid_cases := []struct {
		tweet_url string
		handle    UserHandle
		id        TweetID
	}{
		{"https://twitter.com/kanesays23/status/1429583672827465730", "kanesays23", 1429583672827465730},
		{"https://twitter.com/NerdNoticing/status/1263192389050654720?s=20", "NerdNoticing", 1263192389050654720},
	}
	for _, c := range valid_cases {
		handle, id, is_ok := TryParseTweetUrl(c.tweet_url)
		assert.True(is_ok)
		assert.Equal(c.handle, handle)
		assert.Equal(c.id, id)
	}

	// An invalid url (no `?` before the params) and the empty string must both fail to parse
	invalid_urls := []string{
		"https://twitter.com/NerdNoticing/status/1263192389050654720s=20",
		"",
	}
	for _, bad_url := range invalid_urls {
		_, _, is_ok := TryParseTweetUrl(bad_url)
		assert.False(is_ok)
	}
}
/**
 * `ParseHandleFromTweetUrl` should return the user handle for a valid tweet URL, and an error for
 * an invalid one
 */
func TestParseHandleFromTweetUrl(t *testing.T) {
	assert := assert.New(t)

	// Valid tweet urls; the second one carries GET params
	valid_cases := []struct {
		tweet_url string
		expected  UserHandle
	}{
		{"https://twitter.com/kanesays23/status/1429583672827465730", "kanesays23"},
		{"https://twitter.com/NerdNoticing/status/1263192389050654720?s=20", "NerdNoticing"},
	}
	for _, c := range valid_cases {
		result, err := ParseHandleFromTweetUrl(c.tweet_url)
		assert.NoError(err)
		assert.Equal(c.expected, result)
	}

	// An invalid url (no `?` before the params) and the empty string must both produce an error
	invalid_urls := []string{
		"https://twitter.com/NerdNoticing/status/1263192389050654720s=20",
		"",
	}
	for _, bad_url := range invalid_urls {
		_, err := ParseHandleFromTweetUrl(bad_url)
		assert.Error(err)
	}
}

View File

@ -84,23 +84,7 @@ Joined %s
return ret return ret
} }
/**
 * Patterns used by `ParseHandleFromTweetUrl`, compiled once at package init rather than on every call.
 * The dots in the hostnames are escaped so that they only match a literal `.`.
 */
var (
	short_tweet_url_regex  = regexp.MustCompile(`^https://t\.co/\w{5,20}$`)
	handle_tweet_url_regex = regexp.MustCompile(`^https://twitter\.com/(\w+)/status/\d+(?:\?.*)?$`)
)

/**
 * Given a tweet URL, return the corresponding user handle.
 * Shortened `t.co` links are expanded with `ExpandShortUrl` before being parsed.
 * If tweet url is not valid, return an error.
 */
func ParseHandleFromTweetUrl(tweet_url string) (UserHandle, error) {
	if short_tweet_url_regex.MatchString(tweet_url) {
		tweet_url = ExpandShortUrl(tweet_url)
	}

	matches := handle_tweet_url_regex.FindStringSubmatch(tweet_url)
	if len(matches) != 2 { // matches[0] is the full string; a nil result (length 0) means no match
		return "", fmt.Errorf("Invalid tweet url: %s", tweet_url)
	}
	return UserHandle(matches[1]), nil
}
/** /**
* Unknown Users with handles are only created by direct GetUser calls (either `twitter fetch_user` * Unknown Users with handles are only created by direct GetUser calls (either `twitter fetch_user`

View File

@ -104,33 +104,6 @@ func TestParseDeletedUser(t *testing.T) {
assert.Equal("default_profile.png", user.GetTinyProfileImageLocalPath()) assert.Equal("default_profile.png", user.GetTinyProfileImageLocalPath())
} }
/**
 * `ParseHandleFromTweetUrl` should return the user handle for a valid tweet URL, and an error for
 * an invalid one
 */
func TestParseHandleFromTweetUrl(t *testing.T) {
	assert := assert.New(t)

	// Valid tweet urls; the second one carries GET params
	valid_cases := []struct {
		tweet_url string
		expected  UserHandle
	}{
		{"https://twitter.com/kanesays23/status/1429583672827465730", "kanesays23"},
		{"https://twitter.com/NerdNoticing/status/1263192389050654720?s=20", "NerdNoticing"},
	}
	for _, c := range valid_cases {
		result, err := ParseHandleFromTweetUrl(c.tweet_url)
		assert.NoError(err)
		assert.Equal(c.expected, result)
	}

	// An invalid url (no `?` before the params) and the empty string must both produce an error
	invalid_urls := []string{
		"https://twitter.com/NerdNoticing/status/1263192389050654720s=20",
		"",
	}
	for _, bad_url := range invalid_urls {
		_, err := ParseHandleFromTweetUrl(bad_url)
		assert.Error(err)
	}
}
/** /**
* Should extract a user handle from a shortened tweet URL * Should extract a user handle from a shortened tweet URL
*/ */