diff --git a/scraper/api_types.go b/scraper/api_types.go index 1a58706..baefbd6 100644 --- a/scraper/api_types.go +++ b/scraper/api_types.go @@ -34,6 +34,7 @@ type APIExtendedMedia struct { type APICard struct { Name string `json:"name"` + ShortenedUrl string `json:"url"` BindingValues struct { Domain struct { Value string `json:"string_value"` @@ -79,8 +80,8 @@ type APITweet struct { } `json:"hashtags"` Media []APIMedia `json:"media"` URLs []struct { - ExpandedURL string `json:"expanded_url"` - URL string `json:"url"` + ExpandedURL string `json:"expanded_url"` + ShortenedUrl string `json:"url"` } `json:"urls"` Mentions []struct { UserName string `json:"screen_name"` @@ -107,7 +108,7 @@ type APITweet struct { func (t *APITweet) NormalizeContent() { // Remove embedded links at the end of the text if len(t.Entities.URLs) == 1 { // TODO: should this be `>= 1`, like below? - url := t.Entities.URLs[0].URL + url := t.Entities.URLs[0].ShortenedUrl if strings.Index(t.FullText, url) == len(t.FullText) - len(url) { t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the newline } diff --git a/scraper/tweet.go b/scraper/tweet.go index 20299fd..e62251b 100644 --- a/scraper/tweet.go +++ b/scraper/tweet.go @@ -89,19 +89,16 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) { ret.NumQuoteTweets = apiTweet.QuoteCount ret.InReplyTo = TweetID(apiTweet.InReplyToStatusID) - for i, url := range apiTweet.Entities.URLs { - if i != 0 { - panic(fmt.Sprintf("Tweet with multiple embedded URLs: %d", apiTweet.ID)) - } + for _, url := range apiTweet.Entities.URLs { var url_object Url - if apiTweet.Card.BindingValues.Domain.Value != "" { - // Using the "Domain" field to detect if there is a card + if apiTweet.Card.ShortenedUrl == url.ShortenedUrl { url_object = ParseAPIUrlCard(apiTweet.Card) } url_object.Text = url.ExpandedURL url_object.TweetID = ret.ID ret.Urls = append(ret.Urls, url_object) } + for _, media := range apiTweet.Entities.Media { if media.Type != "photo" { // TODO: remove this eventually panic_str := fmt.Sprintf("Unknown media type: %q", media.Type) diff --git a/scraper/tweet_test.go b/scraper/tweet_test.go index 4120ae5..0a50953 100644 --- a/scraper/tweet_test.go +++ b/scraper/tweet_test.go @@ -226,6 +226,40 @@ func TestParseTweetWithUrlButNoCard(t *testing.T) { } } +func TestParseTweetWithMultipleUrls(t *testing.T) { + data, err := ioutil.ReadFile("test_responses/tweet_with_multiple_urls.json") + if err != nil { + panic(err) + } + var apitweet scraper.APITweet + err = json.Unmarshal(data, &apitweet) + if err != nil { + t.Errorf(err.Error()) + } + tweet, err := scraper.ParseSingleTweet(apitweet) + if err != nil { + t.Errorf(err.Error()) + } + + if len(tweet.Urls) != 3 { + t.Errorf("Expected %d urls, got %d instead", 3, len(tweet.Urls)) + } + if tweet.Urls[0].HasCard { + t.Errorf("Expected url not to have a card, but it does: %d", 0) + } + if tweet.Urls[1].HasCard { + t.Errorf("Expected url not to have a card, but it does: %d", 1) + } + if !tweet.Urls[2].HasCard { + t.Errorf("Expected url to have a card, but it doesn't: %d", 2) + } + expected_title := "Biden’s victory came from the suburbs" + if tweet.Urls[2].Title != expected_title { + t.Errorf("Expected title to be %q, but got %q", expected_title, tweet.Urls[2].Title) + } +} + + func TestParseTweetResponse(t *testing.T) { data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json") if err != nil {