Add support for tweets with multiple URLs

This commit is contained in:
Alessio 2021-09-17 19:45:31 -07:00
parent 690dd99b8f
commit c865df8aca
3 changed files with 41 additions and 9 deletions

View File

@ -34,6 +34,7 @@ type APIExtendedMedia struct {
type APICard struct { type APICard struct {
Name string `json:"name"` Name string `json:"name"`
ShortenedUrl string `json:"url"`
BindingValues struct { BindingValues struct {
Domain struct { Domain struct {
Value string `json:"string_value"` Value string `json:"string_value"`
@ -79,8 +80,8 @@ type APITweet struct {
} `json:"hashtags"` } `json:"hashtags"`
Media []APIMedia `json:"media"` Media []APIMedia `json:"media"`
URLs []struct { URLs []struct {
ExpandedURL string `json:"expanded_url"` ExpandedURL string `json:"expanded_url"`
URL string `json:"url"` ShortenedUrl string `json:"url"`
} `json:"urls"` } `json:"urls"`
Mentions []struct { Mentions []struct {
UserName string `json:"screen_name"` UserName string `json:"screen_name"`
@ -107,7 +108,7 @@ type APITweet struct {
func (t *APITweet) NormalizeContent() { func (t *APITweet) NormalizeContent() {
// Remove embedded links at the end of the text // Remove embedded links at the end of the text
if len(t.Entities.URLs) == 1 { // TODO: should this be `>= 1`, like below? if len(t.Entities.URLs) == 1 { // TODO: should this be `>= 1`, like below?
url := t.Entities.URLs[0].URL url := t.Entities.URLs[0].ShortenedUrl
if strings.Index(t.FullText, url) == len(t.FullText) - len(url) { if strings.Index(t.FullText, url) == len(t.FullText) - len(url) {
t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the newline t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the newline
} }

View File

@ -89,19 +89,16 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
ret.NumQuoteTweets = apiTweet.QuoteCount ret.NumQuoteTweets = apiTweet.QuoteCount
ret.InReplyTo = TweetID(apiTweet.InReplyToStatusID) ret.InReplyTo = TweetID(apiTweet.InReplyToStatusID)
for i, url := range apiTweet.Entities.URLs { for _, url := range apiTweet.Entities.URLs {
if i != 0 {
panic(fmt.Sprintf("Tweet with multiple embedded URLs: %d", apiTweet.ID))
}
var url_object Url var url_object Url
if apiTweet.Card.BindingValues.Domain.Value != "" { if apiTweet.Card.ShortenedUrl == url.ShortenedUrl {
// Using the "Domain" field to detect if there is a card
url_object = ParseAPIUrlCard(apiTweet.Card) url_object = ParseAPIUrlCard(apiTweet.Card)
} }
url_object.Text = url.ExpandedURL url_object.Text = url.ExpandedURL
url_object.TweetID = ret.ID url_object.TweetID = ret.ID
ret.Urls = append(ret.Urls, url_object) ret.Urls = append(ret.Urls, url_object)
} }
for _, media := range apiTweet.Entities.Media { for _, media := range apiTweet.Entities.Media {
if media.Type != "photo" { // TODO: remove this eventually if media.Type != "photo" { // TODO: remove this eventually
panic_str := fmt.Sprintf("Unknown media type: %q", media.Type) panic_str := fmt.Sprintf("Unknown media type: %q", media.Type)

View File

@ -226,6 +226,40 @@ func TestParseTweetWithUrlButNoCard(t *testing.T) {
} }
} }
// TestParseTweetWithMultipleUrls parses a fixture tweet through
// scraper.ParseSingleTweet and checks the resulting Urls slice: it must hold
// three entries, only the last of which has a card, and that card must carry
// the expected title.
func TestParseTweetWithMultipleUrls(t *testing.T) {
	data, err := ioutil.ReadFile("test_responses/tweet_with_multiple_urls.json")
	if err != nil {
		// Fail only this test; a panic would abort the whole test binary.
		t.Fatalf("reading fixture: %v", err)
	}

	var apitweet scraper.APITweet
	if err = json.Unmarshal(data, &apitweet); err != nil {
		// Nothing below is meaningful without a decoded tweet.
		t.Fatalf("unmarshalling fixture: %v", err)
	}

	tweet, err := scraper.ParseSingleTweet(apitweet)
	if err != nil {
		t.Fatalf("parsing tweet: %v", err)
	}

	// Must be fatal: the assertions below index Urls[0..2] and would panic
	// with an out-of-range error if fewer than three URLs were parsed.
	if len(tweet.Urls) != 3 {
		t.Fatalf("Expected %d urls, got %d instead", 3, len(tweet.Urls))
	}

	// The first two URLs are expected to be plain links without a card.
	for i := 0; i < 2; i++ {
		if tweet.Urls[i].HasCard {
			t.Errorf("Expected url not to have a card, but it does: %d", i)
		}
	}

	// The third URL is expected to be the one the card is attached to.
	if !tweet.Urls[2].HasCard {
		t.Errorf("Expected url to have a card, but it doesn't: %d", 2)
	}
	expected_title := "Bidens victory came from the suburbs"
	if tweet.Urls[2].Title != expected_title {
		t.Errorf("Expected title to be %q, but got %q", expected_title, tweet.Urls[2].Title)
	}
}
func TestParseTweetResponse(t *testing.T) { func TestParseTweetResponse(t *testing.T) {
data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json") data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")
if err != nil { if err != nil {