Prevent quoted-tweet URLs from being added as URLs (they should just be quote-tweets)
This commit is contained in:
parent
aa961b9ff4
commit
f22f15f3d9
@ -102,6 +102,7 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
|||||||
ret.NumReplies = apiTweet.ReplyCount
|
ret.NumReplies = apiTweet.ReplyCount
|
||||||
ret.NumQuoteTweets = apiTweet.QuoteCount
|
ret.NumQuoteTweets = apiTweet.QuoteCount
|
||||||
ret.InReplyToID = TweetID(apiTweet.InReplyToStatusID)
|
ret.InReplyToID = TweetID(apiTweet.InReplyToStatusID)
|
||||||
|
ret.QuotedTweetID = TweetID(apiTweet.QuotedStatusID)
|
||||||
|
|
||||||
// Process URLs and link previews
|
// Process URLs and link previews
|
||||||
for _, url := range apiTweet.Entities.URLs {
|
for _, url := range apiTweet.Entities.URLs {
|
||||||
@ -112,14 +113,20 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
|||||||
url_object.Text = url.ExpandedURL
|
url_object.Text = url.ExpandedURL
|
||||||
url_object.ShortText = url.ShortenedUrl
|
url_object.ShortText = url.ShortenedUrl
|
||||||
url_object.TweetID = ret.ID
|
url_object.TweetID = ret.ID
|
||||||
|
|
||||||
|
// Skip it if it's just the quoted tweet
|
||||||
|
_, id, is_ok := TryParseTweetUrl(url.ExpandedURL)
|
||||||
|
if is_ok && id == ret.QuotedTweetID {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
ret.Urls = append(ret.Urls, url_object)
|
ret.Urls = append(ret.Urls, url_object)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process images
|
// Process images
|
||||||
for _, media := range apiTweet.Entities.Media {
|
for _, media := range apiTweet.Entities.Media {
|
||||||
if media.Type != "photo" { // TODO: remove this eventually
|
if media.Type != "photo" { // TODO: remove this eventually
|
||||||
panic_str := fmt.Sprintf("Unknown media type: %q", media.Type)
|
panic(fmt.Sprintf("Unknown media type: %q", media.Type))
|
||||||
panic(panic_str)
|
|
||||||
}
|
}
|
||||||
new_image := ParseAPIMedia(media)
|
new_image := ParseAPIMedia(media)
|
||||||
new_image.TweetID = ret.ID
|
new_image.TweetID = ret.ID
|
||||||
@ -144,7 +151,6 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ret.QuotedTweetID = TweetID(apiTweet.QuotedStatusID)
|
|
||||||
|
|
||||||
// Process videos
|
// Process videos
|
||||||
for _, entity := range apiTweet.ExtendedEntities.Media {
|
for _, entity := range apiTweet.ExtendedEntities.Media {
|
||||||
|
@ -50,6 +50,9 @@ func TestParseTweetWithImage(t *testing.T) {
|
|||||||
assert.Len(tweet.Images, 1)
|
assert.Len(tweet.Images, 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ensure the fake url (link to the quoted tweet) is not parsed as a URL; it should just be ignored
|
||||||
|
*/
|
||||||
func TestParseTweetWithQuotedTweetAsLink(t *testing.T) {
|
func TestParseTweetWithQuotedTweetAsLink(t *testing.T) {
|
||||||
assert := assert.New(t)
|
assert := assert.New(t)
|
||||||
tweet := load_tweet_from_file("test_responses/single_tweets/tweet_with_quoted_tweet_as_link2.json")
|
tweet := load_tweet_from_file("test_responses/single_tweets/tweet_with_quoted_tweet_as_link2.json")
|
||||||
@ -59,6 +62,22 @@ func TestParseTweetWithQuotedTweetAsLink(t *testing.T) {
|
|||||||
assert.Equal(TweetID(1396194494710788100), tweet.QuotedTweetID)
|
assert.Equal(TweetID(1396194494710788100), tweet.QuotedTweetID)
|
||||||
assert.Empty(tweet.ReplyMentions)
|
assert.Empty(tweet.ReplyMentions)
|
||||||
assert.Empty(tweet.Polls)
|
assert.Empty(tweet.Polls)
|
||||||
|
assert.Empty(tweet.Urls)
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Quote-tweets with links should work properly
|
||||||
|
*/
|
||||||
|
func TestParseTweetWithQuotedTweetAndLink(t *testing.T) {
|
||||||
|
assert := assert.New(t)
|
||||||
|
tweet := load_tweet_from_file("test_responses/single_tweets/tweet_with_quoted_tweet_and_url.json")
|
||||||
|
|
||||||
|
assert.Equal("This is video he’s talking about. Please watch. Is there a single US politician capable of doing this with the weasels and rats running American industry today?", tweet.Text)
|
||||||
|
assert.Equal(TweetID(1497997890999898115), tweet.QuotedTweetID)
|
||||||
|
|
||||||
|
assert.Len(tweet.Urls, 1)
|
||||||
|
url := tweet.Urls[0]
|
||||||
|
assert.Equal(url.Text, "https://youtu.be/VjrlTMvirVo")
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestParseTweetWithVideo(t *testing.T) {
|
func TestParseTweetWithVideo(t *testing.T) {
|
||||||
|
@ -3,6 +3,7 @@ package scraper
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"path"
|
"path"
|
||||||
|
"regexp"
|
||||||
"net/url"
|
"net/url"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -60,17 +61,50 @@ func ParseAPIUrlCard(apiCard APICard) Url {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Compute the local filename for a thumbnail from its remote url.
 *
 * If the url has no query string, the result is the last element of the url path.  Otherwise it
 * is "<last path element>_<name>.<format>", using the `name` and `format` query parameters.
 *
 * Panics if the url or its query string can't be parsed, or if a query string is present but is
 * missing `name` or `format`.
 */
func get_thumbnail_local_path(remote_url string) string {
	u, err := url.Parse(remote_url)
	if err != nil {
		panic(err)
	}
	if u.RawQuery == "" {
		return path.Base(u.Path)
	}
	query_params, err := url.ParseQuery(u.RawQuery)
	if err != nil {
		panic(err)
	}

	// Check the required params up front, so that a malformed url produces a panic naming the
	// offending url instead of a bare "index out of range"
	for _, key := range []string{"name", "format"} {
		if len(query_params[key]) == 0 {
			panic(fmt.Sprintf("Thumbnail url has no %q query parameter: %s", key, remote_url))
		}
	}

	return fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0])
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given an URL, try to parse it as a tweet url.
|
||||||
|
* The bool is an `is_ok` value; true if the parse was successful, false if it didn't match
|
||||||
|
*/
|
||||||
|
func TryParseTweetUrl(url string) (UserHandle, TweetID, bool) {
|
||||||
|
r := regexp.MustCompile(`^https://twitter.com/(\w+)/status/(\d+)(?:\?.*)?$`)
|
||||||
|
matches := r.FindStringSubmatch(url)
|
||||||
|
if matches == nil {
|
||||||
|
return UserHandle(""), TweetID(0), false
|
||||||
|
}
|
||||||
|
if len(matches) != 3 { // matches[0] is the full string
|
||||||
|
panic(matches)
|
||||||
|
}
|
||||||
|
return UserHandle(matches[1]), TweetID(int_or_panic(matches[2])), true
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given a tweet URL, return the corresponding user handle.
|
||||||
|
* If tweet url is not valid, return an error.
|
||||||
|
*/
|
||||||
|
func ParseHandleFromTweetUrl(tweet_url string) (UserHandle, error) {
|
||||||
|
short_url_regex := regexp.MustCompile(`^https://t.co/\w{5,20}$`)
|
||||||
|
if short_url_regex.MatchString(tweet_url) {
|
||||||
|
tweet_url = ExpandShortUrl(tweet_url)
|
||||||
|
}
|
||||||
|
|
||||||
|
ret, _, is_ok := TryParseTweetUrl(tweet_url)
|
||||||
|
if !is_ok {
|
||||||
|
return "", fmt.Errorf("Invalid tweet url: %s", tweet_url)
|
||||||
|
}
|
||||||
|
return ret, nil
|
||||||
}
|
}
|
||||||
|
@ -94,3 +94,57 @@ func TestParseAPIUrlCardWithoutThumbnail(t *testing.T) {
|
|||||||
assert.True(url.HasCard)
|
assert.True(url.HasCard)
|
||||||
assert.False(url.HasThumbnail)
|
assert.False(url.HasThumbnail)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should check if a url is a tweet url, and if so, parse it
|
||||||
|
*/
|
||||||
|
func TestParseTweetUrl(t *testing.T) {
|
||||||
|
assert:= assert.New(t)
|
||||||
|
|
||||||
|
// Test valid tweet url
|
||||||
|
url := "https://twitter.com/kanesays23/status/1429583672827465730"
|
||||||
|
handle, id, is_ok := TryParseTweetUrl(url)
|
||||||
|
assert.True(is_ok)
|
||||||
|
assert.Equal(UserHandle("kanesays23"), handle)
|
||||||
|
assert.Equal(TweetID(1429583672827465730), id)
|
||||||
|
|
||||||
|
// Test url with GET params
|
||||||
|
handle, id, is_ok = TryParseTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720?s=20")
|
||||||
|
assert.True(is_ok)
|
||||||
|
assert.Equal(UserHandle("NerdNoticing"), handle)
|
||||||
|
assert.Equal(TweetID(1263192389050654720), id)
|
||||||
|
|
||||||
|
// Test invalid url
|
||||||
|
_, _, is_ok = TryParseTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720s=20")
|
||||||
|
assert.False(is_ok)
|
||||||
|
|
||||||
|
// Test empty string
|
||||||
|
_, _, is_ok = TryParseTweetUrl("")
|
||||||
|
assert.False(is_ok)
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should extract a user handle from a tweet URL, or fail if URL is invalid
|
||||||
|
*/
|
||||||
|
func TestParseHandleFromTweetUrl(t *testing.T) {
|
||||||
|
assert := assert.New(t)
|
||||||
|
|
||||||
|
// Test valid tweet url
|
||||||
|
url := "https://twitter.com/kanesays23/status/1429583672827465730"
|
||||||
|
result, err := ParseHandleFromTweetUrl(url)
|
||||||
|
assert.NoError(err)
|
||||||
|
assert.Equal(UserHandle("kanesays23"), result)
|
||||||
|
|
||||||
|
// Test url with GET params
|
||||||
|
result, err = ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720?s=20")
|
||||||
|
assert.NoError(err)
|
||||||
|
assert.Equal(UserHandle("NerdNoticing"), result)
|
||||||
|
|
||||||
|
// Test invalid url
|
||||||
|
_, err = ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720s=20")
|
||||||
|
assert.Error(err)
|
||||||
|
|
||||||
|
// Test empty string
|
||||||
|
_, err = ParseHandleFromTweetUrl("")
|
||||||
|
assert.Error(err)
|
||||||
|
}
|
||||||
|
@ -84,23 +84,7 @@ Joined %s
|
|||||||
return ret
|
return ret
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Given a tweet URL, return the corresponding user handle.
|
|
||||||
* If tweet url is not valid, return an error.
|
|
||||||
*/
|
|
||||||
func ParseHandleFromTweetUrl(tweet_url string) (UserHandle, error) {
|
|
||||||
short_url_regex := regexp.MustCompile(`^https://t.co/\w{5,20}$`)
|
|
||||||
if short_url_regex.MatchString(tweet_url) {
|
|
||||||
tweet_url = ExpandShortUrl(tweet_url)
|
|
||||||
}
|
|
||||||
|
|
||||||
r := regexp.MustCompile(`^https://twitter.com/(\w+)/status/\d+(?:\?.*)?$`)
|
|
||||||
matches := r.FindStringSubmatch(tweet_url)
|
|
||||||
if len(matches) != 2 { // matches[0] is the full string
|
|
||||||
return "", fmt.Errorf("Invalid tweet url: %s", tweet_url)
|
|
||||||
}
|
|
||||||
return UserHandle(matches[1]), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Unknown Users with handles are only created by direct GetUser calls (either `twitter fetch_user`
|
* Unknown Users with handles are only created by direct GetUser calls (either `twitter fetch_user`
|
||||||
|
@ -104,33 +104,6 @@ func TestParseDeletedUser(t *testing.T) {
|
|||||||
assert.Equal("default_profile.png", user.GetTinyProfileImageLocalPath())
|
assert.Equal("default_profile.png", user.GetTinyProfileImageLocalPath())
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Should extract a user handle from a tweet URL, or fail if URL is invalid
|
|
||||||
*/
|
|
||||||
func TestParseHandleFromTweetUrl(t *testing.T) {
|
|
||||||
assert := assert.New(t)
|
|
||||||
|
|
||||||
// Test valid tweet url
|
|
||||||
url := "https://twitter.com/kanesays23/status/1429583672827465730"
|
|
||||||
result, err := ParseHandleFromTweetUrl(url)
|
|
||||||
assert.NoError(err)
|
|
||||||
assert.Equal(UserHandle("kanesays23"), result)
|
|
||||||
|
|
||||||
// Test url with GET params
|
|
||||||
result, err = ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720?s=20")
|
|
||||||
assert.NoError(err)
|
|
||||||
assert.Equal(UserHandle("NerdNoticing"), result)
|
|
||||||
|
|
||||||
// Test invalid url
|
|
||||||
_, err = ParseHandleFromTweetUrl("https://twitter.com/NerdNoticing/status/1263192389050654720s=20")
|
|
||||||
assert.Error(err)
|
|
||||||
|
|
||||||
// Test empty string
|
|
||||||
_, err = ParseHandleFromTweetUrl("")
|
|
||||||
assert.Error(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Should extract a user handle from a shortened tweet URL
|
* Should extract a user handle from a shortened tweet URL
|
||||||
*/
|
*/
|
||||||
|
Loading…
x
Reference in New Issue
Block a user