Add tweet parsing
This commit is contained in:
parent
88086c5b47
commit
2c8fe25e78
@ -3,13 +3,25 @@ curl -X POST \
|
|||||||
https://api.twitter.com/1.1/guest/activate.json
|
https://api.twitter.com/1.1/guest/activate.json
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# A user profile:
|
||||||
curl \
|
curl \
|
||||||
-H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
|
-H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
|
||||||
-H "X-Guest-Token: 1391174194361217029" \
|
-H "X-Guest-Token: 1396177150890348547" \
|
||||||
https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22michaelmalice%22%2C%22withHighlightedLabel%22%3Atrue%7D
|
https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22michaelmalice%22%2C%22withHighlightedLabel%22%3Atrue%7D
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# A user's feed:
|
||||||
curl \
|
curl \
|
||||||
-H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
|
-H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
|
||||||
-H "X-Guest-Token: 1391214296126967816" \
|
-H "X-Guest-Token: 1396177150890348547" \
|
||||||
https://api.twitter.com/2/timeline/profile/44067298.json
|
https://api.twitter.com/2/timeline/profile/44067298.json
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# A tweet and replies (conversation):
|
||||||
|
curl \
|
||||||
|
-H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
|
||||||
|
-H "X-Guest-Token: 1396177150890348547" \
|
||||||
|
https://twitter.com/i/api/2/timeline/conversation/1395881699142160387.json
|
||||||
|
114
scraper/api_types.go
Normal file
114
scraper/api_types.go
Normal file
@ -0,0 +1,114 @@
|
|||||||
|
package scraper
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// APITweet mirrors the JSON shape of a single tweet object as it appears in
// the "tweets" map of a timeline/conversation response. Only the fields
// listed here are decoded; everything else in the payload is ignored.
type APITweet struct {
	ID                string `json:"id_str"`
	ConversationIDStr string `json:"conversation_id_str"`
	// CreatedAt is kept as the raw string from the API; ParseSingleTweet
	// parses it with the time.RubyDate layout.
	CreatedAt     string `json:"created_at"`
	FavoriteCount int    `json:"favorite_count"`
	FullText      string `json:"full_text"`
	Entities      struct {
		Hashtags []struct {
			Text string `json:"text"`
		} `json:"hashtags"`
		Media []struct {
			MediaURLHttps string `json:"media_url_https"`
			Type          string `json:"type"`
			// URL is the shortened link as it appears inside FullText.
			URL string `json:"url"`
		} `json:"media"`
		URLs []struct {
			ExpandedURL string `json:"expanded_url"`
			// URL is the shortened link as it appears inside FullText.
			URL string `json:"url"`
		} `json:"urls"`
		// Mentions is decoded from the "user_mentions" key of the entities
		// object. Without an explicit tag encoding/json would only match a
		// key named "mentions", which the API does not send.
		Mentions []struct {
			UserName string `json:"screen_name"`
			UserID   string `json:"id_str"`
		} `json:"user_mentions"`
	} `json:"entities"`
	ExtendedEntities struct {
		Media []struct {
			IDStr         string `json:"id_str"`
			MediaURLHttps string `json:"media_url_https"`
			Type          string `json:"type"`
			VideoInfo     struct {
				Variants []struct {
					Bitrate int    `json:"bitrate,omitempty"`
					URL     string `json:"url"`
				} `json:"variants"`
			} `json:"video_info"`
		} `json:"media"`
	} `json:"extended_entities"`
	InReplyToStatusIDStr string `json:"in_reply_to_status_id_str"`
	InReplyToScreenName  string `json:"in_reply_to_screen_name"`
	ReplyCount           int    `json:"reply_count"`
	RetweetCount         int    `json:"retweet_count"`
	QuoteCount           int    `json:"quote_count"`
	RetweetedStatusIDStr string `json:"retweeted_status_id_str"`
	QuotedStatusIDStr    string `json:"quoted_status_id_str"`
	// NOTE(review): nothing visible in this package reads or writes Time;
	// it looks like a placeholder for the parsed CreatedAt. Confirm before
	// relying on it.
	Time      time.Time `json:"time"`
	UserIDStr string    `json:"user_id_str"`
}
|
||||||
|
|
||||||
|
type TweetResponse struct {
|
||||||
|
GlobalObjects struct {
|
||||||
|
Tweets map[string]APITweet `json:"tweets"`
|
||||||
|
Users map[string]struct {
|
||||||
|
CreatedAt string `json:"created_at"`
|
||||||
|
Description string `json:"description"`
|
||||||
|
Entities struct {
|
||||||
|
URL struct {
|
||||||
|
Urls []struct {
|
||||||
|
ExpandedURL string `json:"expanded_url"`
|
||||||
|
} `json:"urls"`
|
||||||
|
} `json:"url"`
|
||||||
|
} `json:"entities"`
|
||||||
|
FavouritesCount int `json:"favourites_count"`
|
||||||
|
FollowersCount int `json:"followers_count"`
|
||||||
|
FriendsCount int `json:"friends_count"`
|
||||||
|
IDStr string `json:"id_str"`
|
||||||
|
ListedCount int `json:"listed_count"`
|
||||||
|
Name string `json:"name"`
|
||||||
|
Location string `json:"location"`
|
||||||
|
PinnedTweetIdsStr []string `json:"pinned_tweet_ids_str"`
|
||||||
|
ProfileBannerURL string `json:"profile_banner_url"`
|
||||||
|
ProfileImageURLHTTPS string `json:"profile_image_url_https"`
|
||||||
|
Protected bool `json:"protected"`
|
||||||
|
ScreenName string `json:"screen_name"`
|
||||||
|
StatusesCount int `json:"statuses_count"`
|
||||||
|
Verified bool `json:"verified"`
|
||||||
|
} `json:"users"`
|
||||||
|
} `json:"globalObjects"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// UserResponse is the decoded envelope of a user-profile response: the
// profile sits at data.user, with the user's ID in "rest_id" and the rest of
// the profile fields nested under "legacy".
type UserResponse struct {
	Data struct {
		User struct {
			ID     string `json:"rest_id"`
			Legacy struct {
				CreatedAt   string `json:"created_at"`
				Description string `json:"description"`
				Entities    struct {
					URL struct {
						Urls []struct {
							ExpandedURL string `json:"expanded_url"`
						} `json:"urls"`
					} `json:"url"`
				} `json:"entities"`
				FavouritesCount      int      `json:"favourites_count"`
				FollowersCount       int      `json:"followers_count"`
				FriendsCount         int      `json:"friends_count"`
				ListedCount          int      `json:"listed_count"`
				Name                 string   `json:"name"`
				Location             string   `json:"location"`
				PinnedTweetIdsStr    []string `json:"pinned_tweet_ids_str"`
				ProfileBannerURL     string   `json:"profile_banner_url"`
				ProfileImageURLHTTPS string   `json:"profile_image_url_https"`
				Protected            bool     `json:"protected"`
				ScreenName           string   `json:"screen_name"`
				StatusesCount        int      `json:"statuses_count"`
				Verified             bool     `json:"verified"`
			} `json:"legacy"`
		} `json:"user"`
	} `json:"data"`
}
|
File diff suppressed because one or more lines are too long
1
scraper/test_responses/midriffs_anarchist_cookbook.json
Normal file
1
scraper/test_responses/midriffs_anarchist_cookbook.json
Normal file
File diff suppressed because one or more lines are too long
94
scraper/tweet.go
Normal file
94
scraper/tweet.go
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
package scraper
|
||||||
|
|
||||||
|
import (
|
||||||
|
"time"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
// TweetID identifies a tweet. It is a string type; ParseSingleTweet fills it
// from the API's string-valued ID fields.
type TweetID string
|
||||||
|
|
||||||
|
type Tweet struct {
|
||||||
|
ID TweetID
|
||||||
|
User UserID
|
||||||
|
Text string
|
||||||
|
PostedAt time.Time
|
||||||
|
NumLikes int
|
||||||
|
NumRetweets int
|
||||||
|
NumReplies int
|
||||||
|
NumQuoteTweets int
|
||||||
|
HasVideo bool
|
||||||
|
InReplyTo TweetID
|
||||||
|
|
||||||
|
Urls []string
|
||||||
|
Images []string
|
||||||
|
Mentions []UserID
|
||||||
|
Hashtags []string
|
||||||
|
QuotedTweet TweetID
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t Tweet) String() string {
|
||||||
|
return fmt.Sprintf(
|
||||||
|
`ID %s, User %s: %q (%s). Likes: %d, Retweets: %d, QTs: %d, Replies: %d.
|
||||||
|
Urls: %v Images: %v Mentions: %v Hashtags: %v`,
|
||||||
|
t.ID, t.User, t.Text, t.PostedAt, t.NumLikes, t.NumRetweets, t.NumQuoteTweets, t.NumReplies, t.Urls, t.Images, t.Mentions, t.Hashtags)
|
||||||
|
}
|
||||||
|
|
||||||
|
func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
||||||
|
ret.ID = TweetID(apiTweet.ID)
|
||||||
|
ret.User = UserID(apiTweet.UserIDStr)
|
||||||
|
ret.Text = apiTweet.FullText
|
||||||
|
|
||||||
|
// Remove embedded links at the end of the text
|
||||||
|
if len(apiTweet.Entities.URLs) == 1 {
|
||||||
|
url := apiTweet.Entities.URLs[0].URL
|
||||||
|
if strings.Index(ret.Text, url) == len(ret.Text) - len(url) {
|
||||||
|
ret.Text = ret.Text[0:len(ret.Text) - len(url) - 1] // Also strip the newline
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(apiTweet.Entities.Media) == 1 {
|
||||||
|
url := apiTweet.Entities.Media[0].URL
|
||||||
|
if strings.Index(ret.Text, url) == len(ret.Text) - len(url) {
|
||||||
|
ret.Text = ret.Text[0:len(ret.Text) - len(url) - 1] // Also strip the trailing space
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove leading `@username` for replies
|
||||||
|
if apiTweet.InReplyToScreenName != "" {
|
||||||
|
if strings.Index(ret.Text, "@" + apiTweet.InReplyToScreenName) == 0 {
|
||||||
|
ret.Text = ret.Text[len(apiTweet.InReplyToScreenName) + 2:] // `@`, username, space
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.PostedAt, err = time.Parse(time.RubyDate, apiTweet.CreatedAt)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
ret.NumLikes = apiTweet.FavoriteCount
|
||||||
|
ret.NumRetweets = apiTweet.RetweetCount
|
||||||
|
ret.NumReplies = apiTweet.ReplyCount
|
||||||
|
ret.NumQuoteTweets = apiTweet.QuoteCount
|
||||||
|
ret.InReplyTo = TweetID(apiTweet.InReplyToStatusIDStr)
|
||||||
|
|
||||||
|
for _, url := range apiTweet.Entities.URLs {
|
||||||
|
ret.Urls = append(ret.Urls, url.ExpandedURL)
|
||||||
|
}
|
||||||
|
for _, media := range apiTweet.Entities.Media {
|
||||||
|
if media.Type != "photo" {
|
||||||
|
panic_str := fmt.Sprintf("Unknown media type: %q", media.Type)
|
||||||
|
panic(panic_str)
|
||||||
|
}
|
||||||
|
ret.Images = append(ret.Images, media.MediaURLHttps)
|
||||||
|
}
|
||||||
|
for _, hashtag := range apiTweet.Entities.Hashtags {
|
||||||
|
ret.Hashtags = append(ret.Hashtags, hashtag.Text)
|
||||||
|
}
|
||||||
|
for _, mention := range apiTweet.Entities.Mentions {
|
||||||
|
ret.Mentions = append(ret.Mentions, UserID(mention.UserID))
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.QuotedTweet = TweetID(apiTweet.QuotedStatusIDStr)
|
||||||
|
ret.HasVideo = false // TODO
|
||||||
|
return
|
||||||
|
}
|
126
scraper/tweet_test.go
Normal file
126
scraper/tweet_test.go
Normal file
@ -0,0 +1,126 @@
|
|||||||
|
package scraper_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
// "fmt"
|
||||||
|
"encoding/json"
|
||||||
|
"io/ioutil"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"offline_twitter/scraper"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParseSingleTweet(t *testing.T) {
|
||||||
|
data, err := ioutil.ReadFile("test_responses/dave_smith_anarchist_handbook.json")
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
var tweet_resp scraper.TweetResponse
|
||||||
|
err = json.Unmarshal(data, &tweet_resp)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf(err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
tweets := tweet_resp.GlobalObjects.Tweets
|
||||||
|
users := tweet_resp.GlobalObjects.Users
|
||||||
|
|
||||||
|
if len(tweets) != 11 {
|
||||||
|
t.Errorf("Expected %d tweets, got %d instead", 11, len(tweets))
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(users) != 11 {
|
||||||
|
t.Errorf("Expected %d users, got %d instead", 11, len(users))
|
||||||
|
}
|
||||||
|
|
||||||
|
dave_smith_tweet, ok := tweets["1395881699142160387"]
|
||||||
|
if !ok {
|
||||||
|
t.Errorf("Didn't find the Dave Smith tweet.")
|
||||||
|
}
|
||||||
|
|
||||||
|
tweet, err := scraper.ParseSingleTweet(dave_smith_tweet)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf(err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
expected_text := "The fact that @michaelmalice new book ‘The Anarchist Handbook’ is just absolutely destroying on the charts is the largest white pill I’ve swallowed in years."
|
||||||
|
actual_text := tweet.Text
|
||||||
|
|
||||||
|
if actual_text != expected_text {
|
||||||
|
t.Errorf("Expected: %q; got %q", expected_text, actual_text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseSingleTweet2(t *testing.T) {
|
||||||
|
data, err := ioutil.ReadFile("test_responses/midriffs_anarchist_cookbook.json")
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
var tweet_resp scraper.TweetResponse
|
||||||
|
err = json.Unmarshal(data, &tweet_resp)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf(err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
tweets := tweet_resp.GlobalObjects.Tweets
|
||||||
|
users := tweet_resp.GlobalObjects.Users
|
||||||
|
|
||||||
|
if len(tweets) != 12 {
|
||||||
|
t.Errorf("Expected %d tweets, got %d instead", 11, len(tweets))
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(users) != 11 {
|
||||||
|
t.Errorf("Expected %d users, got %d instead", 11, len(users))
|
||||||
|
}
|
||||||
|
|
||||||
|
t1, ok := tweets["1395882872729477131"]
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("Didn't find first tweet")
|
||||||
|
}
|
||||||
|
t2, ok := tweets["1396194922009661441"]
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("Didn't find second tweet")
|
||||||
|
}
|
||||||
|
|
||||||
|
tweet1, err := scraper.ParseSingleTweet(t1)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf(err.Error())
|
||||||
|
}
|
||||||
|
tweet2, err := scraper.ParseSingleTweet(t2)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf(err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
expected_text := "this saddens me every time"
|
||||||
|
if tweet1.Text != expected_text {
|
||||||
|
t.Errorf("Expected: %q, got: %q", expected_text, tweet1.Text)
|
||||||
|
}
|
||||||
|
expected_text = "sometimes they're too dimwitted to even get the wrong title right"
|
||||||
|
if tweet2.Text != expected_text {
|
||||||
|
t.Errorf("Expected: %q, got: %q", expected_text, tweet2.Text)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(tweet1.Images) != 1 {
|
||||||
|
t.Errorf("Expected 1 images but got %d", len(tweet1.Images))
|
||||||
|
}
|
||||||
|
if tweet1.QuotedTweet != "" {
|
||||||
|
t.Errorf("Incorrectly believes it quote-tweets %q", tweet1.QuotedTweet)
|
||||||
|
}
|
||||||
|
|
||||||
|
if tweet2.QuotedTweet == "" {
|
||||||
|
t.Errorf("Should be a quoted tweet")
|
||||||
|
}
|
||||||
|
|
||||||
|
quoted_tweet_, ok := tweets[string(tweet2.QuotedTweet)]
|
||||||
|
if !ok {
|
||||||
|
t.Errorf("Couldn't find the quoted tweet")
|
||||||
|
}
|
||||||
|
|
||||||
|
quoted_tweet, err := scraper.ParseSingleTweet(quoted_tweet_)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf(err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
expected_text = "I always liked \"The Anarchist's Cookbook.\""
|
||||||
|
if quoted_tweet.Text != expected_text {
|
||||||
|
t.Errorf("Expected %q, got %q", expected_text, quoted_tweet.Text)
|
||||||
|
}
|
||||||
|
}
|
3
scraper/user.go
Normal file
3
scraper/user.go
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
package scraper
|
||||||
|
|
||||||
|
// UserID identifies a user. It is a string type.
type UserID string
|
Loading…
x
Reference in New Issue
Block a user