Add support for tweets with multiple URLs

This commit is contained in:
Alessio 2021-09-17 19:45:31 -07:00
parent 690dd99b8f
commit c865df8aca
3 changed files with 41 additions and 9 deletions

View File

@ -34,6 +34,7 @@ type APIExtendedMedia struct {
type APICard struct {
Name string `json:"name"`
ShortenedUrl string `json:"url"`
BindingValues struct {
Domain struct {
Value string `json:"string_value"`
@ -80,7 +81,7 @@ type APITweet struct {
Media []APIMedia `json:"media"`
URLs []struct {
ExpandedURL string `json:"expanded_url"`
URL string `json:"url"`
ShortenedUrl string `json:"url"`
} `json:"urls"`
Mentions []struct {
UserName string `json:"screen_name"`
@ -107,7 +108,7 @@ type APITweet struct {
func (t *APITweet) NormalizeContent() {
// Remove embedded links at the end of the text
if len(t.Entities.URLs) == 1 { // TODO: should this be `>= 1`, like below?
url := t.Entities.URLs[0].URL
url := t.Entities.URLs[0].ShortenedUrl
if strings.Index(t.FullText, url) == len(t.FullText) - len(url) {
t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the newline
}

View File

@ -89,19 +89,16 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
ret.NumQuoteTweets = apiTweet.QuoteCount
ret.InReplyTo = TweetID(apiTweet.InReplyToStatusID)
for i, url := range apiTweet.Entities.URLs {
if i != 0 {
panic(fmt.Sprintf("Tweet with multiple embedded URLs: %d", apiTweet.ID))
}
for _, url := range apiTweet.Entities.URLs {
var url_object Url
if apiTweet.Card.BindingValues.Domain.Value != "" {
// Using the "Domain" field to detect if there is a card
if apiTweet.Card.ShortenedUrl == url.ShortenedUrl {
url_object = ParseAPIUrlCard(apiTweet.Card)
}
url_object.Text = url.ExpandedURL
url_object.TweetID = ret.ID
ret.Urls = append(ret.Urls, url_object)
}
for _, media := range apiTweet.Entities.Media {
if media.Type != "photo" { // TODO: remove this eventually
panic_str := fmt.Sprintf("Unknown media type: %q", media.Type)

View File

@ -226,6 +226,40 @@ func TestParseTweetWithUrlButNoCard(t *testing.T) {
}
}
// TestParseTweetWithMultipleUrls loads the "tweet_with_multiple_urls" fixture,
// runs it through scraper.ParseSingleTweet, and expects three parsed URLs of
// which only the last one carries a card with the expected title.
func TestParseTweetWithMultipleUrls(t *testing.T) {
	data, err := ioutil.ReadFile("test_responses/tweet_with_multiple_urls.json")
	if err != nil {
		panic(err)
	}

	var apitweet scraper.APITweet
	if err = json.Unmarshal(data, &apitweet); err != nil {
		// Nothing below is meaningful without a decoded tweet, so stop here.
		t.Fatal(err)
	}

	tweet, err := scraper.ParseSingleTweet(apitweet)
	if err != nil {
		t.Fatal(err)
	}

	// Fatal rather than Error: the assertions below index into tweet.Urls and
	// would panic with an out-of-range index if fewer URLs were returned.
	if len(tweet.Urls) != 3 {
		t.Fatalf("Expected %d urls, got %d instead", 3, len(tweet.Urls))
	}

	// Whether the URL at each index is expected to have a card.
	expect_card := []bool{false, false, true}
	for i, want := range expect_card {
		has := tweet.Urls[i].HasCard
		if has && !want {
			t.Errorf("Expected url not to have a card, but it does: %d", i)
		}
		if !has && want {
			t.Errorf("Expected url to have a card, but it doesn't: %d", i)
		}
	}

	expected_title := "Bidens victory came from the suburbs"
	if tweet.Urls[2].Title != expected_title {
		t.Errorf("Expected title to be %q, but got %q", expected_title, tweet.Urls[2].Title)
	}
}
func TestParseTweetResponse(t *testing.T) {
data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")
if err != nil {