Add tweet parsing

This commit is contained in:
Alessio 2021-05-22 18:20:18 -04:00
parent 88086c5b47
commit 2c8fe25e78
7 changed files with 353 additions and 2 deletions

View File

@ -3,13 +3,25 @@ curl -X POST \
https://api.twitter.com/1.1/guest/activate.json https://api.twitter.com/1.1/guest/activate.json
#
# A user profile:
curl \ curl \
-H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \ -H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
-H "X-Guest-Token: 1391174194361217029" \ -H "X-Guest-Token: 1396177150890348547" \
https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22michaelmalice%22%2C%22withHighlightedLabel%22%3Atrue%7D https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22michaelmalice%22%2C%22withHighlightedLabel%22%3Atrue%7D
#
# A user's feed:
curl \ curl \
-H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \ -H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
-H "X-Guest-Token: 1391214296126967816" \ -H "X-Guest-Token: 1396177150890348547" \
https://api.twitter.com/2/timeline/profile/44067298.json https://api.twitter.com/2/timeline/profile/44067298.json
#
# A tweet and replies (conversation):
curl \
-H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
-H "X-Guest-Token: 1396177150890348547" \
https://twitter.com/i/api/2/timeline/conversation/1395881699142160387.json

114
scraper/api_types.go Normal file
View File

@ -0,0 +1,114 @@
package scraper
import "time"
// APITweet is the JSON shape of a single tweet object as delivered by the
// Twitter API. Struct tags map each field to the API's key; keys that are
// absent from a payload leave the corresponding field at its zero value.
type APITweet struct {
	ID                string `json:"id_str"`
	ConversationIDStr string `json:"conversation_id_str"`
	// CreatedAt is kept as the raw timestamp string from the payload.
	CreatedAt     string `json:"created_at"`
	FavoriteCount int    `json:"favorite_count"`
	FullText      string `json:"full_text"`
	// Entities lists the hashtags, media, links and user mentions that
	// appear in FullText.
	Entities struct {
		Hashtags []struct {
			Text string `json:"text"`
		} `json:"hashtags"`
		Media []struct {
			MediaURLHttps string `json:"media_url_https"`
			Type          string `json:"type"`
			URL           string `json:"url"`
		} `json:"media"`
		URLs []struct {
			ExpandedURL string `json:"expanded_url"`
			URL         string `json:"url"`
		} `json:"urls"`
		// The API delivers mentions under "user_mentions". Without an
		// explicit tag encoding/json would look for a "mentions" key and
		// this slice would always stay empty.
		Mentions []struct {
			UserName string `json:"screen_name"`
			UserID   string `json:"id_str"`
		} `json:"user_mentions"`
	} `json:"entities"`
	// ExtendedEntities carries per-media details, including video variants.
	ExtendedEntities struct {
		Media []struct {
			IDStr         string `json:"id_str"`
			MediaURLHttps string `json:"media_url_https"`
			Type          string `json:"type"`
			VideoInfo     struct {
				Variants []struct {
					Bitrate int    `json:"bitrate,omitempty"`
					URL     string `json:"url"`
				} `json:"variants"`
			} `json:"video_info"`
		} `json:"media"`
	} `json:"extended_entities"`
	InReplyToStatusIDStr string `json:"in_reply_to_status_id_str"`
	InReplyToScreenName  string `json:"in_reply_to_screen_name"`
	ReplyCount           int    `json:"reply_count"`
	RetweetCount         int    `json:"retweet_count"`
	QuoteCount           int    `json:"quote_count"`
	RetweetedStatusIDStr string `json:"retweeted_status_id_str"`
	QuotedStatusIDStr    string `json:"quoted_status_id_str"`
	// NOTE(review): nothing visible here shows the API sending a "time" key;
	// presumably this is meant to be filled in by callers. Confirm.
	Time      time.Time `json:"time"`
	UserIDStr string    `json:"user_id_str"`
}
// TweetResponse is the decoding target for a JSON payload whose top-level
// "globalObjects" object holds a "tweets" map and a "users" map.
type TweetResponse struct {
GlobalObjects struct {
// Tweets maps a string key to the tweet object stored under it.
// The tests in this package look tweets up by their ID string.
Tweets map[string]APITweet `json:"tweets"`
// Users maps a string key to a user profile object.
// NOTE(review): presumably keyed by user ID -- confirm against a real payload.
Users map[string]struct {
CreatedAt string `json:"created_at"`
Description string `json:"description"`
// Entities.URL.Urls holds the expanded form of the profile's link(s).
Entities struct {
URL struct {
Urls []struct {
ExpandedURL string `json:"expanded_url"`
} `json:"urls"`
} `json:"url"`
} `json:"entities"`
FavouritesCount int `json:"favourites_count"`
FollowersCount int `json:"followers_count"`
FriendsCount int `json:"friends_count"`
IDStr string `json:"id_str"`
ListedCount int `json:"listed_count"`
Name string `json:"name"`
Location string `json:"location"`
PinnedTweetIdsStr []string `json:"pinned_tweet_ids_str"`
ProfileBannerURL string `json:"profile_banner_url"`
ProfileImageURLHTTPS string `json:"profile_image_url_https"`
Protected bool `json:"protected"`
ScreenName string `json:"screen_name"`
StatusesCount int `json:"statuses_count"`
Verified bool `json:"verified"`
} `json:"users"`
} `json:"globalObjects"`
}
// UserResponse is the decoding target for a JSON payload shaped as
// {"data": {"user": {"rest_id": ..., "legacy": {...}}}}.
// NOTE(review): presumably the response of the GraphQL user-profile lookup -- confirm.
type UserResponse struct {
Data struct {
User struct {
// ID is read from the "rest_id" key.
ID string `json:"rest_id"`
// Legacy holds the profile fields nested under the "legacy" key.
Legacy struct {
CreatedAt string `json:"created_at"`
Description string `json:"description"`
// Entities.URL.Urls holds the expanded form of the profile's link(s).
Entities struct {
URL struct {
Urls []struct {
ExpandedURL string `json:"expanded_url"`
} `json:"urls"`
} `json:"url"`
} `json:"entities"`
FavouritesCount int `json:"favourites_count"`
FollowersCount int `json:"followers_count"`
FriendsCount int `json:"friends_count"`
ListedCount int `json:"listed_count"`
Name string `json:"name"`
Location string `json:"location"`
PinnedTweetIdsStr []string `json:"pinned_tweet_ids_str"`
ProfileBannerURL string `json:"profile_banner_url"`
ProfileImageURLHTTPS string `json:"profile_image_url_https"`
Protected bool `json:"protected"`
ScreenName string `json:"screen_name"`
StatusesCount int `json:"statuses_count"`
Verified bool `json:"verified"`
} `json:"legacy"`
} `json:"user"`
} `json:"data"`
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

94
scraper/tweet.go Normal file
View File

@ -0,0 +1,94 @@
package scraper
import (
"time"
"fmt"
"strings"
)
// TweetID identifies a tweet. It is a string type; ParseSingleTweet fills it
// from the API's string-valued ID fields.
type TweetID string
// Tweet is the scraper's flattened representation of a tweet, as produced by
// ParseSingleTweet from an APITweet.
type Tweet struct {
ID TweetID
// User is the ID of the tweet's author.
User UserID
// Text is the tweet body after ParseSingleTweet's clean-up of a trailing
// link and a leading reply mention.
Text string
PostedAt time.Time
NumLikes int
NumRetweets int
NumReplies int
NumQuoteTweets int
// HasVideo is currently always set to false by ParseSingleTweet (marked TODO there).
HasVideo bool
// InReplyTo is the ID of the tweet being replied to; empty when the API gives none.
InReplyTo TweetID
// Urls holds the expanded URLs of the tweet's link entities.
Urls []string
// Images holds the HTTPS URLs of the tweet's media entities.
Images []string
// Mentions holds the user IDs of the tweet's mention entities.
Mentions []UserID
// Hashtags holds the text of the tweet's hashtag entities.
Hashtags []string
// QuotedTweet is the ID of the quoted tweet; empty when the API gives none.
QuotedTweet TweetID
}
// String renders the tweet as a two-line summary: the first line carries the
// ID, author, quoted text, timestamp and engagement counters; the second
// lists the URLs, images, mentions and hashtags.
func (t Tweet) String() string {
	// The line break inside the raw literal is part of the output.
	const layout = `ID %s, User %s: %q (%s). Likes: %d, Retweets: %d, QTs: %d, Replies: %d.
Urls: %v Images: %v Mentions: %v Hashtags: %v`

	return fmt.Sprintf(
		layout,
		t.ID, t.User, t.Text, t.PostedAt,
		t.NumLikes, t.NumRetweets, t.NumQuoteTweets, t.NumReplies,
		t.Urls, t.Images, t.Mentions, t.Hashtags,
	)
}
// ParseSingleTweet converts a raw APITweet into a Tweet.
//
// The tweet text is cleaned up as follows:
//   - if the tweet has exactly one link entity and the text ends with its
//     short URL, that URL and the whitespace before it are removed;
//   - the same is then done for a single media entity;
//   - for replies, a leading "@screen_name" naming the reply target is
//     removed together with the whitespace character that follows it.
//
// It returns an error, and a zero Tweet, when CreatedAt is not in
// time.RubyDate layout or when a media entity has a type other than "photo".
func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
	postedAt, err := time.Parse(time.RubyDate, apiTweet.CreatedAt)
	if err != nil {
		return Tweet{}, fmt.Errorf("parsing creation time %q of tweet %q: %w", apiTweet.CreatedAt, apiTweet.ID, err)
	}

	text := apiTweet.FullText
	if links := apiTweet.Entities.URLs; len(links) == 1 {
		text = trimTrailingLink(text, links[0].URL)
	}
	if media := apiTweet.Entities.Media; len(media) == 1 {
		text = trimTrailingLink(text, media[0].URL)
	}
	if apiTweet.InReplyToScreenName != "" {
		text = trimLeadingMention(text, apiTweet.InReplyToScreenName)
	}

	var urls, images, hashtags []string
	var mentions []UserID
	for _, link := range apiTweet.Entities.URLs {
		urls = append(urls, link.ExpandedURL)
	}
	for _, media := range apiTweet.Entities.Media {
		// Report unexpected media to the caller instead of crashing the process.
		if media.Type != "photo" {
			return Tweet{}, fmt.Errorf("unknown media type %q in tweet %q", media.Type, apiTweet.ID)
		}
		images = append(images, media.MediaURLHttps)
	}
	for _, hashtag := range apiTweet.Entities.Hashtags {
		hashtags = append(hashtags, hashtag.Text)
	}
	for _, mention := range apiTweet.Entities.Mentions {
		mentions = append(mentions, UserID(mention.UserID))
	}

	ret = Tweet{
		ID:             TweetID(apiTweet.ID),
		User:           UserID(apiTweet.UserIDStr),
		Text:           text,
		PostedAt:       postedAt,
		NumLikes:       apiTweet.FavoriteCount,
		NumRetweets:    apiTweet.RetweetCount,
		NumReplies:     apiTweet.ReplyCount,
		NumQuoteTweets: apiTweet.QuoteCount,
		HasVideo:       false, // TODO
		InReplyTo:      TweetID(apiTweet.InReplyToStatusIDStr),
		Urls:           urls,
		Images:         images,
		Mentions:       mentions,
		Hashtags:       hashtags,
		QuotedTweet:    TweetID(apiTweet.QuotedStatusIDStr),
	}
	return ret, nil
}

// trimTrailingLink removes link from the end of text along with the
// whitespace separating it from what precedes it. The text is returned
// unchanged when link is empty or is not a suffix of text.
func trimTrailingLink(text, link string) string {
	if link == "" || !strings.HasSuffix(text, link) {
		return text
	}
	// Slicing by the suffix length is safe even when text consists of
	// nothing but the link (e.g. an image-only tweet).
	return strings.TrimRight(text[:len(text)-len(link)], " \t\r\n")
}

// trimLeadingMention removes a leading "@screenName" and the single
// whitespace character after it. A longer handle that merely starts with
// screenName (e.g. "@bobby" when screenName is "bob") is left untouched.
func trimLeadingMention(text, screenName string) string {
	mention := "@" + screenName
	if !strings.HasPrefix(text, mention) {
		return text
	}
	rest := text[len(mention):]
	if rest == "" {
		return ""
	}
	if strings.IndexByte(" \t\r\n", rest[0]) < 0 {
		return text
	}
	return rest[1:]
}

126
scraper/tweet_test.go Normal file
View File

@ -0,0 +1,126 @@
package scraper_test
import (
// "fmt"
"encoding/json"
"io/ioutil"
"testing"
"offline_twitter/scraper"
)
// TestParseSingleTweet decodes a recorded conversation response and checks
// that the focal tweet's text is parsed as expected.
func TestParseSingleTweet(t *testing.T) {
	data, err := ioutil.ReadFile("test_responses/dave_smith_anarchist_handbook.json")
	if err != nil {
		t.Fatalf("reading fixture: %v", err)
	}

	var tweetResp scraper.TweetResponse
	// Nothing below is meaningful if the fixture fails to decode, so stop here.
	if err := json.Unmarshal(data, &tweetResp); err != nil {
		t.Fatalf("decoding fixture: %v", err)
	}

	tweets := tweetResp.GlobalObjects.Tweets
	users := tweetResp.GlobalObjects.Users

	if len(tweets) != 11 {
		t.Errorf("Expected %d tweets, got %d instead", 11, len(tweets))
	}
	if len(users) != 11 {
		t.Errorf("Expected %d users, got %d instead", 11, len(users))
	}

	daveSmithTweet, ok := tweets["1395881699142160387"]
	if !ok {
		// Parsing a zero-value tweet would only produce a misleading second failure.
		t.Fatalf("Didn't find the Dave Smith tweet.")
	}

	tweet, err := scraper.ParseSingleTweet(daveSmithTweet)
	if err != nil {
		t.Fatalf("parsing tweet: %v", err)
	}

	expectedText := "The fact that @michaelmalice new book The Anarchist Handbook is just absolutely destroying on the charts is the largest white pill I’ve swallowed in years."
	if tweet.Text != expectedText {
		t.Errorf("Expected: %q; got %q", expectedText, tweet.Text)
	}
}
// TestParseSingleTweet2 decodes a recorded conversation response that
// contains an image tweet and a quote tweet, and checks text clean-up, image
// extraction and quote-tweet linkage.
func TestParseSingleTweet2(t *testing.T) {
	data, err := ioutil.ReadFile("test_responses/midriffs_anarchist_cookbook.json")
	if err != nil {
		t.Fatalf("reading fixture: %v", err)
	}

	var tweetResp scraper.TweetResponse
	// Nothing below is meaningful if the fixture fails to decode, so stop here.
	if err := json.Unmarshal(data, &tweetResp); err != nil {
		t.Fatalf("decoding fixture: %v", err)
	}

	tweets := tweetResp.GlobalObjects.Tweets
	users := tweetResp.GlobalObjects.Users

	// The reported expectation must match the value actually checked.
	if len(tweets) != 12 {
		t.Errorf("Expected %d tweets, got %d instead", 12, len(tweets))
	}
	if len(users) != 11 {
		t.Errorf("Expected %d users, got %d instead", 11, len(users))
	}

	t1, ok := tweets["1395882872729477131"]
	if !ok {
		t.Fatalf("Didn't find first tweet")
	}
	t2, ok := tweets["1396194922009661441"]
	if !ok {
		t.Fatalf("Didn't find second tweet")
	}

	tweet1, err := scraper.ParseSingleTweet(t1)
	if err != nil {
		t.Fatalf("parsing first tweet: %v", err)
	}
	tweet2, err := scraper.ParseSingleTweet(t2)
	if err != nil {
		t.Fatalf("parsing second tweet: %v", err)
	}

	expectedText := "this saddens me every time"
	if tweet1.Text != expectedText {
		t.Errorf("Expected: %q, got: %q", expectedText, tweet1.Text)
	}
	expectedText = "sometimes they're too dimwitted to even get the wrong title right"
	if tweet2.Text != expectedText {
		t.Errorf("Expected: %q, got: %q", expectedText, tweet2.Text)
	}

	if len(tweet1.Images) != 1 {
		t.Errorf("Expected 1 images but got %d", len(tweet1.Images))
	}

	if tweet1.QuotedTweet != "" {
		t.Errorf("Incorrectly believes it quote-tweets %q", tweet1.QuotedTweet)
	}
	if tweet2.QuotedTweet == "" {
		// Without a quoted ID the lookup below cannot succeed.
		t.Fatalf("Should be a quoted tweet")
	}

	quotedRaw, ok := tweets[string(tweet2.QuotedTweet)]
	if !ok {
		// Parsing a zero-value tweet would only produce a misleading second failure.
		t.Fatalf("Couldn't find the quoted tweet")
	}
	quotedTweet, err := scraper.ParseSingleTweet(quotedRaw)
	if err != nil {
		t.Fatalf("parsing quoted tweet: %v", err)
	}

	expectedText = "I always liked \"The Anarchist's Cookbook.\""
	if quotedTweet.Text != expectedText {
		t.Errorf("Expected %q, got %q", expectedText, quotedTweet.Text)
	}
}

3
scraper/user.go Normal file
View File

@ -0,0 +1,3 @@
package scraper
// UserID identifies a user. It is a string type; ParseSingleTweet fills it
// from the API's string-valued user ID fields.
type UserID string