Add support for tweets with multiple URLs
This commit is contained in:
parent
690dd99b8f
commit
c865df8aca
@ -34,6 +34,7 @@ type APIExtendedMedia struct {
|
||||
|
||||
type APICard struct {
|
||||
Name string `json:"name"`
|
||||
ShortenedUrl string `json:"url"`
|
||||
BindingValues struct {
|
||||
Domain struct {
|
||||
Value string `json:"string_value"`
|
||||
@ -80,7 +81,7 @@ type APITweet struct {
|
||||
Media []APIMedia `json:"media"`
|
||||
URLs []struct {
|
||||
ExpandedURL string `json:"expanded_url"`
|
||||
URL string `json:"url"`
|
||||
ShortenedUrl string `json:"url"`
|
||||
} `json:"urls"`
|
||||
Mentions []struct {
|
||||
UserName string `json:"screen_name"`
|
||||
@ -107,7 +108,7 @@ type APITweet struct {
|
||||
func (t *APITweet) NormalizeContent() {
|
||||
// Remove embedded links at the end of the text
|
||||
if len(t.Entities.URLs) == 1 { // TODO: should this be `>= 1`, like below?
|
||||
url := t.Entities.URLs[0].URL
|
||||
url := t.Entities.URLs[0].ShortenedUrl
|
||||
if strings.Index(t.FullText, url) == len(t.FullText) - len(url) {
|
||||
t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the newline
|
||||
}
|
||||
|
@ -89,19 +89,16 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
||||
ret.NumQuoteTweets = apiTweet.QuoteCount
|
||||
ret.InReplyTo = TweetID(apiTweet.InReplyToStatusID)
|
||||
|
||||
for i, url := range apiTweet.Entities.URLs {
|
||||
if i != 0 {
|
||||
panic(fmt.Sprintf("Tweet with multiple embedded URLs: %d", apiTweet.ID))
|
||||
}
|
||||
for _, url := range apiTweet.Entities.URLs {
|
||||
var url_object Url
|
||||
if apiTweet.Card.BindingValues.Domain.Value != "" {
|
||||
// Using the "Domain" field to detect if there is a card
|
||||
if apiTweet.Card.ShortenedUrl == url.ShortenedUrl {
|
||||
url_object = ParseAPIUrlCard(apiTweet.Card)
|
||||
}
|
||||
url_object.Text = url.ExpandedURL
|
||||
url_object.TweetID = ret.ID
|
||||
ret.Urls = append(ret.Urls, url_object)
|
||||
}
|
||||
|
||||
for _, media := range apiTweet.Entities.Media {
|
||||
if media.Type != "photo" { // TODO: remove this eventually
|
||||
panic_str := fmt.Sprintf("Unknown media type: %q", media.Type)
|
||||
|
@ -226,6 +226,40 @@ func TestParseTweetWithUrlButNoCard(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseTweetWithMultipleUrls(t *testing.T) {
|
||||
data, err := ioutil.ReadFile("test_responses/tweet_with_multiple_urls.json")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
var apitweet scraper.APITweet
|
||||
err = json.Unmarshal(data, &apitweet)
|
||||
if err != nil {
|
||||
t.Errorf(err.Error())
|
||||
}
|
||||
tweet, err := scraper.ParseSingleTweet(apitweet)
|
||||
if err != nil {
|
||||
t.Errorf(err.Error())
|
||||
}
|
||||
|
||||
if len(tweet.Urls) != 3 {
|
||||
t.Errorf("Expected %d urls, got %d instead", 3, len(tweet.Urls))
|
||||
}
|
||||
if tweet.Urls[0].HasCard {
|
||||
t.Errorf("Expected url not to have a card, but it does: %d", 0)
|
||||
}
|
||||
if tweet.Urls[1].HasCard {
|
||||
t.Errorf("Expected url not to have a card, but it does: %d", 1)
|
||||
}
|
||||
if !tweet.Urls[2].HasCard {
|
||||
t.Errorf("Expected url to have a card, but it doesn't: %d", 2)
|
||||
}
|
||||
expected_title := "Biden’s victory came from the suburbs"
|
||||
if tweet.Urls[2].Title != expected_title {
|
||||
t.Errorf("Expected title to be %q, but got %q", expected_title, tweet.Urls[2].Title)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
func TestParseTweetResponse(t *testing.T) {
|
||||
data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")
|
||||
if err != nil {
|
||||
|
Loading…
x
Reference in New Issue
Block a user