Prevent quoted-tweet URLs from being added as URLs (they should just be quote-tweets)
This commit is contained in:
parent
aa961b9ff4
commit
f22f15f3d9
@ -102,6 +102,7 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
||||
ret.NumReplies = apiTweet.ReplyCount
|
||||
ret.NumQuoteTweets = apiTweet.QuoteCount
|
||||
ret.InReplyToID = TweetID(apiTweet.InReplyToStatusID)
|
||||
ret.QuotedTweetID = TweetID(apiTweet.QuotedStatusID)
|
||||
|
||||
// Process URLs and link previews
|
||||
for _, url := range apiTweet.Entities.URLs {
|
||||
@ -112,14 +113,20 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
||||
url_object.Text = url.ExpandedURL
|
||||
url_object.ShortText = url.ShortenedUrl
|
||||
url_object.TweetID = ret.ID
|
||||
|
||||
// Skip it if it's just the quoted tweet
|
||||
_, id, is_ok := TryParseTweetUrl(url.ExpandedURL)
|
||||
if is_ok && id == ret.QuotedTweetID {
|
||||
continue
|
||||
}
|
||||
|
||||
ret.Urls = append(ret.Urls, url_object)
|
||||
}
|
||||
|
||||
// Process images
|
||||
for _, media := range apiTweet.Entities.Media {
|
||||
if media.Type != "photo" { // TODO: remove this eventually
|
||||
panic_str := fmt.Sprintf("Unknown media type: %q", media.Type)
|
||||
panic(panic_str)
|
||||
panic(fmt.Sprintf("Unknown media type: %q", media.Type))
|
||||
}
|
||||
new_image := ParseAPIMedia(media)
|
||||
new_image.TweetID = ret.ID
|
||||
@ -144,7 +151,6 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
||||
}
|
||||
}
|
||||
|
||||
ret.QuotedTweetID = TweetID(apiTweet.QuotedStatusID)
|
||||
|
||||
// Process videos
|
||||
for _, entity := range apiTweet.ExtendedEntities.Media {
|
||||
|
@ -50,6 +50,9 @@ func TestParseTweetWithImage(t *testing.T) {
|
||||
assert.Len(tweet.Images, 1)
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure the fake url (link to the quoted tweet) is not parsed as a URL; it should just be ignored
|
||||
*/
|
||||
func TestParseTweetWithQuotedTweetAsLink(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
tweet := load_tweet_from_file("test_responses/single_tweets/tweet_with_quoted_tweet_as_link2.json")
|
||||
@ -59,6 +62,22 @@ func TestParseTweetWithQuotedTweetAsLink(t *testing.T) {
|
||||
assert.Equal(TweetID(1396194494710788100), tweet.QuotedTweetID)
|
||||
assert.Empty(tweet.ReplyMentions)
|
||||
assert.Empty(tweet.Polls)
|
||||
assert.Empty(tweet.Urls)
|
||||
}
|
||||
|
||||
/**
|
||||
* Quote-tweets with links should work properly
|
||||
*/
|
||||
func TestParseTweetWithQuotedTweetAndLink(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
tweet := load_tweet_from_file("test_responses/single_tweets/tweet_with_quoted_tweet_and_url.json")
|
||||
|
||||
assert.Equal("This is video he’s talking about. Please watch. Is there a single US politician capable of doing this with the weasels and rats running American industry today?", tweet.Text)
|
||||
assert.Equal(TweetID(1497997890999898115), tweet.QuotedTweetID)
|
||||
|
||||
assert.Len(tweet.Urls, 1)
|
||||
url := tweet.Urls[0]
|
||||
assert.Equal(url.Text, "https://youtu.be/VjrlTMvirVo")
|
||||
}
|
||||
|
||||
func TestParseTweetWithVideo(t *testing.T) {
|
||||
|
@ -3,6 +3,7 @@ package scraper
|
||||
import (
|
||||
"fmt"
|
||||
"path"
|
||||
"regexp"
|
||||
"net/url"
|
||||
)
|
||||
|
||||
@ -60,17 +61,50 @@ func ParseAPIUrlCard(apiCard APICard) Url {
|
||||
}
|
||||
|
||||
/**
 * Compute the local filename to use for a remote thumbnail URL.
 *
 * - If the URL has no query string, the result is just the last element of the URL path.
 * - Otherwise, the result is "<last path element>_<name>.<format>", where `name` and `format`
 *   are taken from the first value of the query parameters of the same names.
 *
 * Panics if the URL or its query string cannot be parsed, or if a query string is present but
 * lacks a `name` or `format` parameter (indexing the missing value is out of range).
 */
func get_thumbnail_local_path(remote_url string) string {
	u, err := url.Parse(remote_url)
	if err != nil {
		panic(err)
	}
	if u.RawQuery == "" {
		return path.Base(u.Path)
	}
	query_params, err := url.ParseQuery(u.RawQuery)
	if err != nil {
		panic(err)
	}

	return fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0])
}
|
||||
|
||||
/**
|
||||
* Given an URL, try to parse it as a tweet url.
|
||||
* The bool is an `is_ok` value; true if the parse was successful, false if it didn't match
|
||||
*/
|
||||
func TryParseTweetUrl(url string) (UserHandle, TweetID, bool) {
|
||||
r := regexp.MustCompile(`^https://twitter.com/(\w+)/status/(\d+)(?:\?.*)?$`)
|
||||
matches := r.FindStringSubmatch(url)
|
||||
if matches == nil {
|
||||
return UserHandle(""), TweetID(0), false
|
||||
}
|
||||
if len(matches) != 3 { // matches[0] is the full string
|
||||
panic(matches)
|
||||
}
|
||||
return UserHandle(matches[1]), TweetID(int_or_panic(matches[2])), true
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a tweet URL, return the corresponding user handle.
|
||||
* If tweet url is not valid, return an error.
|
||||
*/
|
||||
func ParseHandleFromTweetUrl(tweet_url string) (UserHandle, error) {
|
||||
short_url_regex := regexp.MustCompile(`^https://t.co/\w{5,20}$`)
|
||||
if short_url_regex.MatchString(tweet_url) {
|
||||
tweet_url = ExpandShortUrl(tweet_url)
|
||||
}
|
||||
|
||||
ret, _, is_ok := TryParseTweetUrl(tweet_url)
|
||||
if !is_ok {
|
||||
return "", fmt.Errorf("Invalid tweet url: %s", tweet_url)
|
||||
}
|
||||
return ret, nil
|
||||
}
|
||||
|
@ -94,3 +94,57 @@ func TestParseAPIUrlCardWithoutThumbnail(t *testing.T) {
|
||||
assert.True(url.HasCard)
|
||||
assert.False(url.HasThumbnail)
|
||||
}
|
||||
|
||||
/**
|
||||
* Should check if a url is a tweet url, and if so, parse it
|
||||
*/
|
||||
func TestParseTweetUrl(t *testing.T) {
|
||||
assert:= assert.New(t)
|
||||
|
||||
// Test valid tweet url
|
||||
url := "https://twitter.com/kanesays23/status/1429583672827465730"
|
||||
handle, id, is_ok := TryParseTweetUrl(url)
|
||||
assert.True(is_ok)
|
||||
assert.Equal(UserHandle("kanesays23"), handle)
|
||||
assert.Equal(TweetID(1429583672827465730), id)
|
||||
|
||||
// Test url with GET params
|
||||
handle, id, is_ok = TryParseTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720?s=20")
|
||||
assert.True(is_ok)
|
||||
assert.Equal(UserHandle("NerdNoticing"), handle)
|
||||
assert.Equal(TweetID(1263192389050654720), id)
|
||||
|
||||
// Test invalid url
|
||||
_, _, is_ok = TryParseTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720s=20")
|
||||
assert.False(is_ok)
|
||||
|
||||
// Test empty string
|
||||
_, _, is_ok = TryParseTweetUrl("")
|
||||
assert.False(is_ok)
|
||||
}
|
||||
|
||||
/**
|
||||
* Should extract a user handle from a tweet URL, or fail if URL is invalid
|
||||
*/
|
||||
func TestParseHandleFromTweetUrl(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
// Test valid tweet url
|
||||
url := "https://twitter.com/kanesays23/status/1429583672827465730"
|
||||
result, err := ParseHandleFromTweetUrl(url)
|
||||
assert.NoError(err)
|
||||
assert.Equal(UserHandle("kanesays23"), result)
|
||||
|
||||
// Test url with GET params
|
||||
result, err = ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720?s=20")
|
||||
assert.NoError(err)
|
||||
assert.Equal(UserHandle("NerdNoticing"), result)
|
||||
|
||||
// Test invalid url
|
||||
_, err = ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720s=20")
|
||||
assert.Error(err)
|
||||
|
||||
// Test empty string
|
||||
_, err = ParseHandleFromTweetUrl("")
|
||||
assert.Error(err)
|
||||
}
|
||||
|
@ -84,23 +84,7 @@ Joined %s
|
||||
return ret
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a tweet URL, return the corresponding user handle.
|
||||
* If tweet url is not valid, return an error.
|
||||
*/
|
||||
func ParseHandleFromTweetUrl(tweet_url string) (UserHandle, error) {
|
||||
short_url_regex := regexp.MustCompile(`^https://t.co/\w{5,20}$`)
|
||||
if short_url_regex.MatchString(tweet_url) {
|
||||
tweet_url = ExpandShortUrl(tweet_url)
|
||||
}
|
||||
|
||||
r := regexp.MustCompile(`^https://twitter.com/(\w+)/status/\d+(?:\?.*)?$`)
|
||||
matches := r.FindStringSubmatch(tweet_url)
|
||||
if len(matches) != 2 { // matches[0] is the full string
|
||||
return "", fmt.Errorf("Invalid tweet url: %s", tweet_url)
|
||||
}
|
||||
return UserHandle(matches[1]), nil
|
||||
}
|
||||
|
||||
/**
|
||||
* Unknown Users with handles are only created by direct GetUser calls (either `twitter fetch_user`
|
||||
|
@ -104,33 +104,6 @@ func TestParseDeletedUser(t *testing.T) {
|
||||
assert.Equal("default_profile.png", user.GetTinyProfileImageLocalPath())
|
||||
}
|
||||
|
||||
/**
|
||||
* Should extract a user handle from a tweet URL, or fail if URL is invalid
|
||||
*/
|
||||
func TestParseHandleFromTweetUrl(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
// Test valid tweet url
|
||||
url := "https://twitter.com/kanesays23/status/1429583672827465730"
|
||||
result, err := ParseHandleFromTweetUrl(url)
|
||||
assert.NoError(err)
|
||||
assert.Equal(UserHandle("kanesays23"), result)
|
||||
|
||||
// Test url with GET params
|
||||
result, err = ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720?s=20")
|
||||
assert.NoError(err)
|
||||
assert.Equal(UserHandle("NerdNoticing"), result)
|
||||
|
||||
// Test invalid url
|
||||
_, err = ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720s=20")
|
||||
assert.Error(err)
|
||||
|
||||
// Test empty string
|
||||
_, err = ParseHandleFromTweetUrl("")
|
||||
assert.Error(err)
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Should extract a user handle from a shortened tweet URL
|
||||
*/
|
||||
|
Loading…
x
Reference in New Issue
Block a user