Add link parsing for APIv2
This commit is contained in:
parent
1bc6aec3b5
commit
9eb3e42539
@ -163,7 +163,7 @@ type APITweet struct {
|
||||
QuotedStatusIDStr string `json:"quoted_status_id_str"` // Can be empty string
|
||||
QuotedStatusID int64
|
||||
QuotedStatusPermalink struct {
|
||||
URL string `json:"url"`
|
||||
ShortURL string `json:"url"`
|
||||
ExpandedURL string `json:"expanded"`
|
||||
} `json:"quoted_status_permalink"`
|
||||
Time time.Time `json:"time"`
|
||||
@ -188,10 +188,20 @@ func (t *APITweet) NormalizeContent() {
|
||||
t.FullText = string([]rune(t.FullText)[t.DisplayTextRange[0]:t.DisplayTextRange[1]])
|
||||
}
|
||||
|
||||
// Handle short links showing up at ends of tweets
|
||||
for _, url := range t.Entities.URLs {
|
||||
index := strings.Index(t.FullText, url.ShortenedUrl)
|
||||
if index == (len(t.FullText) - len(url.ShortenedUrl)) {
|
||||
t.FullText = strings.TrimSpace(t.FullText[0:index])
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// Handle pasted tweet links that turn into quote tweets but still have a link in them
|
||||
// This is a separate case from above because we want it gone even if it's in the middle of the tweet
|
||||
if t.QuotedStatusID != 0 {
|
||||
for _, url := range t.Entities.URLs {
|
||||
if url.ShortenedUrl == t.QuotedStatusPermalink.URL {
|
||||
if url.ShortenedUrl == t.QuotedStatusPermalink.ShortURL {
|
||||
t.FullText = strings.ReplaceAll(t.FullText, url.ShortenedUrl, "")
|
||||
}
|
||||
}
|
||||
|
@ -10,6 +10,67 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
// CardValue is a single value attached to a card binding, as decoded from the API v2 JSON.
// It carries one slot per value kind (string, image, user, boolean) alongside a `type` tag.
// NOTE(review): presumably `Type` indicates which of the slots is populated -- confirm against
// real API responses.
type CardValue struct {
	// Decoded from the "type" key
	Type string `json:"type"`

	// Decoded from the "string_value" key
	StringValue string `json:"string_value"`

	// Decoded from the "image_value" object: alt text, pixel dimensions and the image URL
	ImageValue struct {
		AltText string `json:"alt"`
		Height  int    `json:"height"`
		Width   int    `json:"width"`
		Url     string `json:"url"`
	} `json:"image_value"`

	// Decoded from the "user_value" object
	UserValue struct {
		// "id_str" arrives as a quoted number; the `,string` option decodes it into an int64
		ID int64 `json:"id_str,string"`
	} `json:"user_value"`

	// Decoded from the "boolean_value" key
	BooleanValue bool `json:"boolean_value"`
}
|
||||
|
||||
type APIV2Card struct {
|
||||
Legacy struct {
|
||||
BindingValues []struct {
|
||||
Key string `json:"key"`
|
||||
Value CardValue `json:"value"`
|
||||
} `json:"binding_values"`
|
||||
Name string `json:"name"`
|
||||
Url string `json:"url"`
|
||||
} `json:"legacy"`
|
||||
}
|
||||
func (card APIV2Card) ParseAsUrl() Url {
|
||||
values := make(map[string]CardValue)
|
||||
for _, obj := range card.Legacy.BindingValues {
|
||||
values[obj.Key] = obj.Value
|
||||
}
|
||||
|
||||
ret := Url{}
|
||||
ret.HasCard = true
|
||||
|
||||
ret.ShortText = card.Legacy.Url
|
||||
ret.Domain = values["domain"].StringValue
|
||||
ret.Title = values["title"].StringValue
|
||||
ret.Description = values["description"].StringValue
|
||||
ret.IsContentDownloaded = false
|
||||
ret.CreatorID = UserID(values["creator"].UserValue.ID)
|
||||
ret.SiteID = UserID(values["site"].UserValue.ID)
|
||||
|
||||
var thumbnail_url string
|
||||
if card.Legacy.Name == "summary_large_image" || card.Legacy.Name == "summary" {
|
||||
thumbnail_url = values["thumbnail_image_large"].ImageValue.Url
|
||||
} else if card.Legacy.Name == "player" {
|
||||
thumbnail_url = values["player_image_large"].ImageValue.Url
|
||||
} else {
|
||||
panic("TODO unknown card type")
|
||||
}
|
||||
|
||||
if thumbnail_url != "" {
|
||||
ret.HasThumbnail = true
|
||||
ret.ThumbnailRemoteUrl = thumbnail_url
|
||||
ret.ThumbnailLocalPath = get_thumbnail_local_path(thumbnail_url)
|
||||
ret.ThumbnailWidth = values["thumbnail_image_large"].ImageValue.Width
|
||||
ret.ThumbnailHeight = values["thumbnail_image_large"].ImageValue.Height
|
||||
}
|
||||
return ret
|
||||
}
|
||||
|
||||
type APIV2UserResult struct {
|
||||
UserResults struct {
|
||||
Result struct {
|
||||
@ -37,6 +98,7 @@ type APIV2Result struct {
|
||||
} `json:"text"`
|
||||
} `json:"tombstone"`
|
||||
Core *APIV2UserResult `json:"core"`
|
||||
Card APIV2Card `json:"card"`
|
||||
QuotedStatusResult *APIV2Result `json:"quoted_status_result"`
|
||||
} `json:"result"`
|
||||
}
|
||||
@ -74,6 +136,25 @@ func (api_result APIV2Result) ToTweetTrove() TweetTrove {
|
||||
ret.MergeWith(quoted_trove)
|
||||
}
|
||||
|
||||
// Handle URL cards
|
||||
if api_result.Result.Card.Legacy.Name == "summary_large_image" || api_result.Result.Card.Legacy.Name == "player" {
|
||||
url := api_result.Result.Card.ParseAsUrl()
|
||||
|
||||
main_tweet := ret.Tweets[TweetID(api_result.Result.Legacy.ID)]
|
||||
found := false
|
||||
for i := range main_tweet.Urls {
|
||||
if main_tweet.Urls[i].ShortText != url.ShortText {
|
||||
continue
|
||||
}
|
||||
found = true
|
||||
url.Text = main_tweet.Urls[i].Text // Copy the expanded URL over, since the card doesn't have it in the new API
|
||||
main_tweet.Urls[i] = url
|
||||
}
|
||||
if !found {
|
||||
panic("Tweet trove doesn't contain its own main tweet")
|
||||
}
|
||||
}
|
||||
|
||||
return ret
|
||||
}
|
||||
|
||||
|
@ -268,6 +268,72 @@ func TestAPIV2ParseRetweetedQuoteTweet(t *testing.T) {
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Parse a tweet with a link
|
||||
*/
|
||||
func TestAPIV2ParseTweetWithURL(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
data, err := ioutil.ReadFile("test_responses/api_v2/tweet_with_url.json")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
var tweet_result APIV2Result
|
||||
err = json.Unmarshal(data, &tweet_result)
|
||||
assert.NoError(err)
|
||||
|
||||
trove := tweet_result.ToTweetTrove()
|
||||
|
||||
assert.Equal(1, len(trove.Tweets))
|
||||
tweet, ok := trove.Tweets[1485695695025803264]
|
||||
assert.True(ok)
|
||||
assert.Equal("This led to what I discussed as \"anguish signaling,\" where progs competed in proclaiming their distress both to show they were the Good Guys but also to get the pack to regroup, akin to wolves howling.", tweet.Text)
|
||||
|
||||
assert.Equal(1, len(tweet.Urls))
|
||||
url := tweet.Urls[0]
|
||||
assert.Equal("observer.com", url.Domain)
|
||||
assert.Equal("Why Evangelical Progressives Need to Demonstrate Anguish Publicly", url.Title)
|
||||
assert.Equal("https://observer.com/2016/12/why-evangelical-progressives-need-to-demonstrate-anguish-publicly/", url.Text)
|
||||
assert.Equal("The concept of “virtue signaling” gained a great deal of currency in this past year. It’s a way to demonstrate to others that one is a good person without having to do anything", url.Description)
|
||||
assert.Equal("https://pbs.twimg.com/card_img/1485694664640507911/WsproWyP?format=jpg&name=600x600", url.ThumbnailRemoteUrl)
|
||||
assert.Equal(600, url.ThumbnailWidth)
|
||||
assert.Equal(300, url.ThumbnailHeight)
|
||||
assert.Equal(UserID(15738599), url.SiteID)
|
||||
assert.Equal(UserID(15738599), url.CreatorID)
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a tweet with a link with a "player" card
|
||||
*/
|
||||
func TestAPIV2ParseTweetWithURLPlayerCard(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
data, err := ioutil.ReadFile("test_responses/api_v2/tweet_with_url_player_card.json")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
var tweet_result APIV2Result
|
||||
err = json.Unmarshal(data, &tweet_result)
|
||||
assert.NoError(err)
|
||||
|
||||
trove := tweet_result.ToTweetTrove()
|
||||
|
||||
assert.Equal(1, len(trove.Tweets))
|
||||
tweet, ok := trove.Tweets[1485504913614327808]
|
||||
assert.True(ok)
|
||||
assert.Equal("i'll just leave this here", tweet.Text)
|
||||
|
||||
assert.Equal(1, len(tweet.Urls))
|
||||
url := tweet.Urls[0]
|
||||
assert.Equal("www.youtube.com", url.Domain)
|
||||
assert.Equal("Michael Malice on Kennedy Nov. 15, 2016", url.Title)
|
||||
assert.Equal("https://www.youtube.com/watch?v=c9TypEM1ik4&t=9s", url.Text)
|
||||
assert.Equal("Steve Bannon;", url.Description)
|
||||
assert.Equal("https://pbs.twimg.com/card_img/1485504774233415680/fsbK59th?format=jpg&name=800x320_1", url.ThumbnailRemoteUrl)
|
||||
assert.Equal(UserID(10228272), url.SiteID)
|
||||
}
|
||||
|
||||
|
||||
func TestParseAPIV2UserFeed(t *testing.T) {
|
||||
data, err := ioutil.ReadFile("test_responses/api_v2/user_feed_apiv2.json")
|
||||
if err != nil {
|
||||
|
1
scraper/test_responses/api_v2/tweet_with_url.json
Normal file
1
scraper/test_responses/api_v2/tweet_with_url.json
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user