Add tweet parsing
This commit is contained in:
parent
88086c5b47
commit
2c8fe25e78
@ -3,13 +3,25 @@ curl -X POST \
|
||||
https://api.twitter.com/1.1/guest/activate.json
|
||||
|
||||
|
||||
#
|
||||
# A user profile:
# Calls the GraphQL UserByScreenName endpoint. The `variables` query parameter
# is URL-encoded JSON: {"screen_name":"michaelmalice","withHighlightedLabel":true}.
# NOTE(review): two X-Guest-Token headers are listed below; this looks like the
# old and new value of a diff shown together -- confirm which one should remain.
|
||||
curl \
|
||||
-H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
|
||||
-H "X-Guest-Token: 1391174194361217029" \
|
||||
-H "X-Guest-Token: 1396177150890348547" \
|
||||
https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22michaelmalice%22%2C%22withHighlightedLabel%22%3Atrue%7D
|
||||
|
||||
|
||||
#
|
||||
# A user's feed:
# Requests the profile timeline as JSON. The number in the URL path is
# presumably the numeric user ID -- TODO confirm.
# NOTE(review): two X-Guest-Token headers are listed below; this looks like the
# old and new value of a diff shown together -- confirm which one should remain.
|
||||
curl \
|
||||
-H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
|
||||
-H "X-Guest-Token: 1391214296126967816" \
|
||||
-H "X-Guest-Token: 1396177150890348547" \
|
||||
https://api.twitter.com/2/timeline/profile/44067298.json
|
||||
|
||||
|
||||
#
|
||||
# A tweet and replies (conversation):
# Requests the conversation timeline as JSON. The number in the URL path is the
# same ID the Go tests look up as a key in the response's tweet map, so it is
# presumably the ID of the tweet whose conversation is wanted -- TODO confirm.
|
||||
curl \
|
||||
-H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
|
||||
-H "X-Guest-Token: 1396177150890348547" \
|
||||
https://twitter.com/i/api/2/timeline/conversation/1395881699142160387.json
|
||||
|
114
scraper/api_types.go
Normal file
114
scraper/api_types.go
Normal file
@ -0,0 +1,114 @@
|
||||
package scraper
|
||||
|
||||
import "time"
|
||||
|
||||
// APITweet mirrors the JSON shape of a single tweet object as it appears in
// the "tweets" map of an API response (see TweetResponse). Only the fields
// listed here are decoded; everything else in the payload is ignored.
type APITweet struct {
	ID                string `json:"id_str"`
	ConversationIDStr string `json:"conversation_id_str"`
	CreatedAt         string `json:"created_at"` // parsed by ParseSingleTweet using time.RubyDate
	FavoriteCount     int    `json:"favorite_count"`
	FullText          string `json:"full_text"`

	// Entities lists the hashtags, media, links and user mentions that occur
	// in FullText.
	Entities struct {
		Hashtags []struct {
			Text string `json:"text"`
		} `json:"hashtags"`
		Media []struct {
			MediaURLHttps string `json:"media_url_https"`
			Type          string `json:"type"`
			URL           string `json:"url"`
		} `json:"media"`
		URLs []struct {
			ExpandedURL string `json:"expanded_url"`
			URL         string `json:"url"`
		} `json:"urls"`
		// The payload key is "user_mentions". Without an explicit tag the
		// decoder would only match a key named "mentions", so this slice
		// would always stay empty.
		Mentions []struct {
			UserName string `json:"screen_name"`
			UserID   string `json:"id_str"`
		} `json:"user_mentions"`
	} `json:"entities"`

	// ExtendedEntities carries the richer media records, including the
	// available video variants.
	ExtendedEntities struct {
		Media []struct {
			IDStr         string `json:"id_str"`
			MediaURLHttps string `json:"media_url_https"`
			Type          string `json:"type"`
			VideoInfo     struct {
				Variants []struct {
					Bitrate int    `json:"bitrate,omitempty"`
					URL     string `json:"url"`
				} `json:"variants"`
			} `json:"video_info"`
		} `json:"media"`
	} `json:"extended_entities"`

	InReplyToStatusIDStr string `json:"in_reply_to_status_id_str"`
	InReplyToScreenName  string `json:"in_reply_to_screen_name"`
	ReplyCount           int    `json:"reply_count"`
	RetweetCount         int    `json:"retweet_count"`
	QuoteCount           int    `json:"quote_count"`
	RetweetedStatusIDStr string `json:"retweeted_status_id_str"`
	QuotedStatusIDStr    string `json:"quoted_status_id_str"`
	// NOTE(review): nothing in this file fills Time; presumably it is meant to
	// be set by callers rather than decoded -- confirm.
	Time      time.Time `json:"time"`
	UserIDStr string    `json:"user_id_str"`
}
|
||||
|
||||
type TweetResponse struct {
|
||||
GlobalObjects struct {
|
||||
Tweets map[string]APITweet `json:"tweets"`
|
||||
Users map[string]struct {
|
||||
CreatedAt string `json:"created_at"`
|
||||
Description string `json:"description"`
|
||||
Entities struct {
|
||||
URL struct {
|
||||
Urls []struct {
|
||||
ExpandedURL string `json:"expanded_url"`
|
||||
} `json:"urls"`
|
||||
} `json:"url"`
|
||||
} `json:"entities"`
|
||||
FavouritesCount int `json:"favourites_count"`
|
||||
FollowersCount int `json:"followers_count"`
|
||||
FriendsCount int `json:"friends_count"`
|
||||
IDStr string `json:"id_str"`
|
||||
ListedCount int `json:"listed_count"`
|
||||
Name string `json:"name"`
|
||||
Location string `json:"location"`
|
||||
PinnedTweetIdsStr []string `json:"pinned_tweet_ids_str"`
|
||||
ProfileBannerURL string `json:"profile_banner_url"`
|
||||
ProfileImageURLHTTPS string `json:"profile_image_url_https"`
|
||||
Protected bool `json:"protected"`
|
||||
ScreenName string `json:"screen_name"`
|
||||
StatusesCount int `json:"statuses_count"`
|
||||
Verified bool `json:"verified"`
|
||||
} `json:"users"`
|
||||
} `json:"globalObjects"`
|
||||
}
|
||||
|
||||
// UserResponse is the decoded shape of a user-profile response: the user's
// "rest_id" plus the profile fields nested under "legacy", all found at
// data.user in the payload.
type UserResponse struct {
	Data struct {
		User struct {
			ID     string `json:"rest_id"`
			Legacy struct {
				CreatedAt   string `json:"created_at"`
				Description string `json:"description"`
				Entities    struct {
					URL struct {
						Urls []struct {
							ExpandedURL string `json:"expanded_url"`
						} `json:"urls"`
					} `json:"url"`
				} `json:"entities"`
				FavouritesCount      int      `json:"favourites_count"`
				FollowersCount       int      `json:"followers_count"`
				FriendsCount         int      `json:"friends_count"`
				ListedCount          int      `json:"listed_count"`
				Name                 string   `json:"name"`
				Location             string   `json:"location"`
				PinnedTweetIdsStr    []string `json:"pinned_tweet_ids_str"`
				ProfileBannerURL     string   `json:"profile_banner_url"`
				ProfileImageURLHTTPS string   `json:"profile_image_url_https"`
				Protected            bool     `json:"protected"`
				ScreenName           string   `json:"screen_name"`
				StatusesCount        int      `json:"statuses_count"`
				Verified             bool     `json:"verified"`
			} `json:"legacy"`
		} `json:"user"`
	} `json:"data"`
}
|
File diff suppressed because one or more lines are too long
1
scraper/test_responses/midriffs_anarchist_cookbook.json
Normal file
1
scraper/test_responses/midriffs_anarchist_cookbook.json
Normal file
File diff suppressed because one or more lines are too long
94
scraper/tweet.go
Normal file
94
scraper/tweet.go
Normal file
@ -0,0 +1,94 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"time"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
|
||||
// TweetID is a tweet's identifier, kept in the string form the API supplies.
type TweetID string
|
||||
|
||||
type Tweet struct {
|
||||
ID TweetID
|
||||
User UserID
|
||||
Text string
|
||||
PostedAt time.Time
|
||||
NumLikes int
|
||||
NumRetweets int
|
||||
NumReplies int
|
||||
NumQuoteTweets int
|
||||
HasVideo bool
|
||||
InReplyTo TweetID
|
||||
|
||||
Urls []string
|
||||
Images []string
|
||||
Mentions []UserID
|
||||
Hashtags []string
|
||||
QuotedTweet TweetID
|
||||
}
|
||||
|
||||
func (t Tweet) String() string {
|
||||
return fmt.Sprintf(
|
||||
`ID %s, User %s: %q (%s). Likes: %d, Retweets: %d, QTs: %d, Replies: %d.
|
||||
Urls: %v Images: %v Mentions: %v Hashtags: %v`,
|
||||
t.ID, t.User, t.Text, t.PostedAt, t.NumLikes, t.NumRetweets, t.NumQuoteTweets, t.NumReplies, t.Urls, t.Images, t.Mentions, t.Hashtags)
|
||||
}
|
||||
|
||||
func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
||||
ret.ID = TweetID(apiTweet.ID)
|
||||
ret.User = UserID(apiTweet.UserIDStr)
|
||||
ret.Text = apiTweet.FullText
|
||||
|
||||
// Remove embedded links at the end of the text
|
||||
if len(apiTweet.Entities.URLs) == 1 {
|
||||
url := apiTweet.Entities.URLs[0].URL
|
||||
if strings.Index(ret.Text, url) == len(ret.Text) - len(url) {
|
||||
ret.Text = ret.Text[0:len(ret.Text) - len(url) - 1] // Also strip the newline
|
||||
}
|
||||
}
|
||||
if len(apiTweet.Entities.Media) == 1 {
|
||||
url := apiTweet.Entities.Media[0].URL
|
||||
if strings.Index(ret.Text, url) == len(ret.Text) - len(url) {
|
||||
ret.Text = ret.Text[0:len(ret.Text) - len(url) - 1] // Also strip the trailing space
|
||||
}
|
||||
}
|
||||
|
||||
// Remove leading `@username` for replies
|
||||
if apiTweet.InReplyToScreenName != "" {
|
||||
if strings.Index(ret.Text, "@" + apiTweet.InReplyToScreenName) == 0 {
|
||||
ret.Text = ret.Text[len(apiTweet.InReplyToScreenName) + 2:] // `@`, username, space
|
||||
}
|
||||
}
|
||||
|
||||
ret.PostedAt, err = time.Parse(time.RubyDate, apiTweet.CreatedAt)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
ret.NumLikes = apiTweet.FavoriteCount
|
||||
ret.NumRetweets = apiTweet.RetweetCount
|
||||
ret.NumReplies = apiTweet.ReplyCount
|
||||
ret.NumQuoteTweets = apiTweet.QuoteCount
|
||||
ret.InReplyTo = TweetID(apiTweet.InReplyToStatusIDStr)
|
||||
|
||||
for _, url := range apiTweet.Entities.URLs {
|
||||
ret.Urls = append(ret.Urls, url.ExpandedURL)
|
||||
}
|
||||
for _, media := range apiTweet.Entities.Media {
|
||||
if media.Type != "photo" {
|
||||
panic_str := fmt.Sprintf("Unknown media type: %q", media.Type)
|
||||
panic(panic_str)
|
||||
}
|
||||
ret.Images = append(ret.Images, media.MediaURLHttps)
|
||||
}
|
||||
for _, hashtag := range apiTweet.Entities.Hashtags {
|
||||
ret.Hashtags = append(ret.Hashtags, hashtag.Text)
|
||||
}
|
||||
for _, mention := range apiTweet.Entities.Mentions {
|
||||
ret.Mentions = append(ret.Mentions, UserID(mention.UserID))
|
||||
}
|
||||
|
||||
ret.QuotedTweet = TweetID(apiTweet.QuotedStatusIDStr)
|
||||
ret.HasVideo = false // TODO
|
||||
return
|
||||
}
|
126
scraper/tweet_test.go
Normal file
126
scraper/tweet_test.go
Normal file
@ -0,0 +1,126 @@
|
||||
package scraper_test
|
||||
|
||||
import (
|
||||
// "fmt"
|
||||
"encoding/json"
|
||||
"io/ioutil"
|
||||
"testing"
|
||||
|
||||
"offline_twitter/scraper"
|
||||
)
|
||||
|
||||
func TestParseSingleTweet(t *testing.T) {
|
||||
data, err := ioutil.ReadFile("test_responses/dave_smith_anarchist_handbook.json")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
var tweet_resp scraper.TweetResponse
|
||||
err = json.Unmarshal(data, &tweet_resp)
|
||||
if err != nil {
|
||||
t.Errorf(err.Error())
|
||||
}
|
||||
|
||||
tweets := tweet_resp.GlobalObjects.Tweets
|
||||
users := tweet_resp.GlobalObjects.Users
|
||||
|
||||
if len(tweets) != 11 {
|
||||
t.Errorf("Expected %d tweets, got %d instead", 11, len(tweets))
|
||||
}
|
||||
|
||||
if len(users) != 11 {
|
||||
t.Errorf("Expected %d users, got %d instead", 11, len(users))
|
||||
}
|
||||
|
||||
dave_smith_tweet, ok := tweets["1395881699142160387"]
|
||||
if !ok {
|
||||
t.Errorf("Didn't find the Dave Smith tweet.")
|
||||
}
|
||||
|
||||
tweet, err := scraper.ParseSingleTweet(dave_smith_tweet)
|
||||
if err != nil {
|
||||
t.Fatalf(err.Error())
|
||||
}
|
||||
|
||||
expected_text := "The fact that @michaelmalice new book ‘The Anarchist Handbook’ is just absolutely destroying on the charts is the largest white pill I’ve swallowed in years."
|
||||
actual_text := tweet.Text
|
||||
|
||||
if actual_text != expected_text {
|
||||
t.Errorf("Expected: %q; got %q", expected_text, actual_text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseSingleTweet2(t *testing.T) {
|
||||
data, err := ioutil.ReadFile("test_responses/midriffs_anarchist_cookbook.json")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
var tweet_resp scraper.TweetResponse
|
||||
err = json.Unmarshal(data, &tweet_resp)
|
||||
if err != nil {
|
||||
t.Errorf(err.Error())
|
||||
}
|
||||
|
||||
tweets := tweet_resp.GlobalObjects.Tweets
|
||||
users := tweet_resp.GlobalObjects.Users
|
||||
|
||||
if len(tweets) != 12 {
|
||||
t.Errorf("Expected %d tweets, got %d instead", 11, len(tweets))
|
||||
}
|
||||
|
||||
if len(users) != 11 {
|
||||
t.Errorf("Expected %d users, got %d instead", 11, len(users))
|
||||
}
|
||||
|
||||
t1, ok := tweets["1395882872729477131"]
|
||||
if !ok {
|
||||
t.Fatalf("Didn't find first tweet")
|
||||
}
|
||||
t2, ok := tweets["1396194922009661441"]
|
||||
if !ok {
|
||||
t.Fatalf("Didn't find second tweet")
|
||||
}
|
||||
|
||||
tweet1, err := scraper.ParseSingleTweet(t1)
|
||||
if err != nil {
|
||||
t.Fatalf(err.Error())
|
||||
}
|
||||
tweet2, err := scraper.ParseSingleTweet(t2)
|
||||
if err != nil {
|
||||
t.Fatalf(err.Error())
|
||||
}
|
||||
|
||||
expected_text := "this saddens me every time"
|
||||
if tweet1.Text != expected_text {
|
||||
t.Errorf("Expected: %q, got: %q", expected_text, tweet1.Text)
|
||||
}
|
||||
expected_text = "sometimes they're too dimwitted to even get the wrong title right"
|
||||
if tweet2.Text != expected_text {
|
||||
t.Errorf("Expected: %q, got: %q", expected_text, tweet2.Text)
|
||||
}
|
||||
|
||||
if len(tweet1.Images) != 1 {
|
||||
t.Errorf("Expected 1 images but got %d", len(tweet1.Images))
|
||||
}
|
||||
if tweet1.QuotedTweet != "" {
|
||||
t.Errorf("Incorrectly believes it quote-tweets %q", tweet1.QuotedTweet)
|
||||
}
|
||||
|
||||
if tweet2.QuotedTweet == "" {
|
||||
t.Errorf("Should be a quoted tweet")
|
||||
}
|
||||
|
||||
quoted_tweet_, ok := tweets[string(tweet2.QuotedTweet)]
|
||||
if !ok {
|
||||
t.Errorf("Couldn't find the quoted tweet")
|
||||
}
|
||||
|
||||
quoted_tweet, err := scraper.ParseSingleTweet(quoted_tweet_)
|
||||
if err != nil {
|
||||
t.Errorf(err.Error())
|
||||
}
|
||||
|
||||
expected_text = "I always liked \"The Anarchist's Cookbook.\""
|
||||
if quoted_tweet.Text != expected_text {
|
||||
t.Errorf("Expected %q, got %q", expected_text, quoted_tweet.Text)
|
||||
}
|
||||
}
|
3
scraper/user.go
Normal file
3
scraper/user.go
Normal file
@ -0,0 +1,3 @@
|
||||
package scraper
|
||||
|
||||
// UserID is a user's identifier, kept in the string form the API supplies.
type UserID string
|
Loading…
x
Reference in New Issue
Block a user