Add support for tweets with multiple URLs
This commit is contained in:
parent
690dd99b8f
commit
c865df8aca
@ -34,6 +34,7 @@ type APIExtendedMedia struct {
|
|||||||
|
|
||||||
type APICard struct {
|
type APICard struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
|
ShortenedUrl string `json:"url"`
|
||||||
BindingValues struct {
|
BindingValues struct {
|
||||||
Domain struct {
|
Domain struct {
|
||||||
Value string `json:"string_value"`
|
Value string `json:"string_value"`
|
||||||
@ -79,8 +80,8 @@ type APITweet struct {
|
|||||||
} `json:"hashtags"`
|
} `json:"hashtags"`
|
||||||
Media []APIMedia `json:"media"`
|
Media []APIMedia `json:"media"`
|
||||||
URLs []struct {
|
URLs []struct {
|
||||||
ExpandedURL string `json:"expanded_url"`
|
ExpandedURL string `json:"expanded_url"`
|
||||||
URL string `json:"url"`
|
ShortenedUrl string `json:"url"`
|
||||||
} `json:"urls"`
|
} `json:"urls"`
|
||||||
Mentions []struct {
|
Mentions []struct {
|
||||||
UserName string `json:"screen_name"`
|
UserName string `json:"screen_name"`
|
||||||
@ -107,7 +108,7 @@ type APITweet struct {
|
|||||||
func (t *APITweet) NormalizeContent() {
|
func (t *APITweet) NormalizeContent() {
|
||||||
// Remove embedded links at the end of the text
|
// Remove embedded links at the end of the text
|
||||||
if len(t.Entities.URLs) == 1 { // TODO: should this be `>= 1`, like below?
|
if len(t.Entities.URLs) == 1 { // TODO: should this be `>= 1`, like below?
|
||||||
url := t.Entities.URLs[0].URL
|
url := t.Entities.URLs[0].ShortenedUrl
|
||||||
if strings.Index(t.FullText, url) == len(t.FullText) - len(url) {
|
if strings.Index(t.FullText, url) == len(t.FullText) - len(url) {
|
||||||
t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the newline
|
t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the newline
|
||||||
}
|
}
|
||||||
|
@ -89,19 +89,16 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
|||||||
ret.NumQuoteTweets = apiTweet.QuoteCount
|
ret.NumQuoteTweets = apiTweet.QuoteCount
|
||||||
ret.InReplyTo = TweetID(apiTweet.InReplyToStatusID)
|
ret.InReplyTo = TweetID(apiTweet.InReplyToStatusID)
|
||||||
|
|
||||||
for i, url := range apiTweet.Entities.URLs {
|
for _, url := range apiTweet.Entities.URLs {
|
||||||
if i != 0 {
|
|
||||||
panic(fmt.Sprintf("Tweet with multiple embedded URLs: %d", apiTweet.ID))
|
|
||||||
}
|
|
||||||
var url_object Url
|
var url_object Url
|
||||||
if apiTweet.Card.BindingValues.Domain.Value != "" {
|
if apiTweet.Card.ShortenedUrl == url.ShortenedUrl {
|
||||||
// Using the "Domain" field to detect if there is a card
|
|
||||||
url_object = ParseAPIUrlCard(apiTweet.Card)
|
url_object = ParseAPIUrlCard(apiTweet.Card)
|
||||||
}
|
}
|
||||||
url_object.Text = url.ExpandedURL
|
url_object.Text = url.ExpandedURL
|
||||||
url_object.TweetID = ret.ID
|
url_object.TweetID = ret.ID
|
||||||
ret.Urls = append(ret.Urls, url_object)
|
ret.Urls = append(ret.Urls, url_object)
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, media := range apiTweet.Entities.Media {
|
for _, media := range apiTweet.Entities.Media {
|
||||||
if media.Type != "photo" { // TODO: remove this eventually
|
if media.Type != "photo" { // TODO: remove this eventually
|
||||||
panic_str := fmt.Sprintf("Unknown media type: %q", media.Type)
|
panic_str := fmt.Sprintf("Unknown media type: %q", media.Type)
|
||||||
|
@ -226,6 +226,40 @@ func TestParseTweetWithUrlButNoCard(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseTweetWithMultipleUrls(t *testing.T) {
|
||||||
|
data, err := ioutil.ReadFile("test_responses/tweet_with_multiple_urls.json")
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
var apitweet scraper.APITweet
|
||||||
|
err = json.Unmarshal(data, &apitweet)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf(err.Error())
|
||||||
|
}
|
||||||
|
tweet, err := scraper.ParseSingleTweet(apitweet)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf(err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(tweet.Urls) != 3 {
|
||||||
|
t.Errorf("Expected %d urls, got %d instead", 3, len(tweet.Urls))
|
||||||
|
}
|
||||||
|
if tweet.Urls[0].HasCard {
|
||||||
|
t.Errorf("Expected url not to have a card, but it does: %d", 0)
|
||||||
|
}
|
||||||
|
if tweet.Urls[1].HasCard {
|
||||||
|
t.Errorf("Expected url not to have a card, but it does: %d", 1)
|
||||||
|
}
|
||||||
|
if !tweet.Urls[2].HasCard {
|
||||||
|
t.Errorf("Expected url to have a card, but it doesn't: %d", 2)
|
||||||
|
}
|
||||||
|
expected_title := "Biden’s victory came from the suburbs"
|
||||||
|
if tweet.Urls[2].Title != expected_title {
|
||||||
|
t.Errorf("Expected title to be %q, but got %q", expected_title, tweet.Urls[2].Title)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
func TestParseTweetResponse(t *testing.T) {
|
func TestParseTweetResponse(t *testing.T) {
|
||||||
data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")
|
data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user