Add parsing of new GraphQL twitter api for user feeds

This commit is contained in:
Alessio 2022-01-24 20:13:17 -08:00
parent 8250725d2c
commit fb421215df
9 changed files with 656 additions and 0 deletions

310
scraper/api_types_v2.go Normal file
View File

@ -0,0 +1,310 @@
package scraper
import (
"fmt"
"net/http"
"net/url"
"io/ioutil"
"time"
"encoding/json"
"strings"
)
/**
 * The "legacy" tweet object as returned by the GraphQL API: an APITweet plus the nested
 * result describing the retweeted tweet (and the tweet that one quotes), each with its author.
 */
type APIV2Tweet struct {
	APITweet
	// Details of the original tweet.  ToTweetTrove treats a zero Result.ID as "not a retweet".
	RetweetedStatusResult struct {
		Result struct {
			// int64, like every other rest_id in this file: tweet IDs do not fit in 32 bits,
			// so a plain `int` fails to decode on platforms where int is 32 bits wide
			ID     int64    `json:"rest_id,string"`
			Legacy APITweet `json:"legacy"`
			// Author of the retweeted tweet
			Core struct {
				UserResults struct {
					Result struct {
						ID     int64   `json:"rest_id,string"`
						Legacy APIUser `json:"legacy"`
					} `json:"result"`
				} `json:"user_results"`
			} `json:"core"`
			// The tweet quoted by the retweeted tweet, if any
			QuotedStatusResult struct {
				Result struct {
					ID     int64    `json:"rest_id,string"`
					Legacy APITweet `json:"legacy"`
					// Author of the quoted tweet
					Core struct {
						UserResults struct {
							Result struct {
								ID     int64   `json:"rest_id,string"`
								Legacy APIUser `json:"legacy"`
							} `json:"result"`
						} `json:"user_results"`
					} `json:"core"`
				} `json:"result"`
			} `json:"quoted_status_result"`
		} `json:"result"`
	} `json:"retweeted_status_result"`
}
/**
 * The JSON shape of a user-timeline response from the GraphQL API, as decoded by
 * GetGraphqlFeedFor.
 *
 * Each timeline Instruction holds a list of Entries.  An entry is either a tweet (its EntryID
 * starts with "tweet-" and its Content.ItemContent is filled in) or a cursor (its
 * Content.Value and Content.CursorType are filled in).
 */
type APIV2Response struct {
Data struct {
User struct {
Result struct {
Timeline struct {
Timeline struct {
Instructions []struct {
Type string `json:"type"`
Entries []struct {
EntryID string `json:"entryId"`
// Sent as a JSON string; decoded to an integer via the ",string" option
SortIndex int64 `json:"sortIndex,string"`
Content struct {
// Tweets
ItemContent struct {
EntryType string `json:"entryType"`
TweetResults struct {
Result struct {
Legacy APIV2Tweet `json:"legacy"`
// Author of the tweet
Core struct {
UserResults struct {
Result struct {
ID int64 `json:"rest_id,string"`
Legacy APIUser `json:"legacy"`
} `json:"result"`
} `json:"user_results"`
} `json:"core"`
QuotedStatusResult struct { // Same as "Result"
Result struct {
ID int64 `json:"rest_id,string"`
Legacy APIV2Tweet `json:"legacy"`
// Author of the quoted tweet
Core struct {
UserResults struct {
Result struct {
ID int64 `json:"rest_id,string"`
Legacy APIUser `json:"legacy"`
} `json:"result"`
} `json:"user_results"`
} `json:"core"`
} `json:"result"`
} `json:"quoted_status_result"`
} `json:"result"`
} `json:"tweet_results"`
} `json:"itemContent"`
// Cursors
EntryType string `json:"entryType"`
Value string `json:"value"`
// GetCursorBottom looks for the value "Bottom" here
CursorType string `json:"cursorType"`
} `json:"content"`
} `json:"entries"`
} `json:"instructions"`
} `json:"timeline"`
} `json:"timeline"`
} `json:"result"`
} `json:"user"`
} `json:"data"`
}
/**
 * Get the "Bottom" cursor of the feed, which is expected to be the last entry of the first
 * timeline Instruction.
 *
 * returns: the cursor's value, or "" if the response has no Instructions or no Entries
 * (callers compare against "" to decide whether there is anything more to fetch)
 *
 * panics: if there are entries but the last one is not a "Bottom" cursor
 */
func (api_response APIV2Response) GetCursorBottom() string {
	instructions := api_response.Data.User.Result.Timeline.Timeline.Instructions
	if len(instructions) == 0 || len(instructions[0].Entries) == 0 {
		// Nothing to index into; without this guard the lookups below panic with
		// "index out of range" on an empty response
		return ""
	}

	entries := instructions[0].Entries
	last_entry := entries[len(entries)-1]
	if last_entry.Content.CursorType != "Bottom" {
		panic("No bottom cursor found")
	}
	return last_entry.Content.Value
}
/**
 * Parse the collected API response and turn it into a TweetTrove
 *
 * Only the first timeline Instruction is read, and only entries whose ID starts with "tweet-"
 * are parsed; everything else is skipped.
 *
 * For a retweet: the retweeted tweet, its author, the retweeting user and a Retweet record are
 * stored (plus the tweet it quotes and that tweet's author, if any).  The retweeting tweet
 * itself is not stored as a Tweet.
 * For any other tweet: the tweet and its author are stored, plus the quoted tweet and its
 * author if there is one.
 *
 * returns: the populated TweetTrove; or a zero TweetTrove and an error if a tweet, a user or a
 * retweet timestamp fails to parse.  A response with no Instructions gives an empty trove.
 */
func (api_response APIV2Response) ToTweetTrove() (TweetTrove, error) {
	ret := NewTweetTrove()

	instructions := api_response.Data.User.Result.Timeline.Timeline.Instructions
	if len(instructions) == 0 {
		// No timeline at all: nothing to parse (indexing [0] would panic)
		return ret, nil
	}

	for _, entry := range instructions[0].Entries { // TODO: the second Instruction is the pinned tweet
		if !strings.HasPrefix(entry.EntryID, "tweet-") {
			// Not a tweet (e.g., a cursor entry)
			continue
		}

		result := entry.Content.ItemContent.TweetResults.Result
		apiv2_tweet := result.Legacy
		apiv2_user_result := result.Core.UserResults.Result
		apiv2_retweeted_tweet_result := apiv2_tweet.RetweetedStatusResult.Result
		apiv2_retweeted_tweet_user := apiv2_retweeted_tweet_result.Core.UserResults.Result
		apiv2_retweeted_quoted_result := apiv2_retweeted_tweet_result.QuotedStatusResult.Result
		apiv2_retweeted_quoted_user := apiv2_retweeted_quoted_result.Core.UserResults.Result
		apiv2_quoted_tweet_result := result.QuotedStatusResult.Result
		apiv2_quoted_user_result := apiv2_quoted_tweet_result.Core.UserResults.Result

		// Handle case of retweet (main tweet doesn't get parsed other than retweeted_at)
		if apiv2_retweeted_tweet_result.ID != 0 {
			orig_tweet, err := ParseSingleTweet(apiv2_retweeted_tweet_result.Legacy)
			if err != nil {
				return TweetTrove{}, err
			}
			ret.Tweets[orig_tweet.ID] = orig_tweet

			orig_user, err := ParseSingleUser(apiv2_retweeted_tweet_user.Legacy)
			if err != nil {
				return TweetTrove{}, err
			}
			// The user ID comes from the enclosing result's rest_id
			orig_user.ID = UserID(apiv2_retweeted_tweet_user.ID)
			ret.Users[orig_user.ID] = orig_user

			retweeting_user, err := ParseSingleUser(apiv2_user_result.Legacy)
			if err != nil {
				return TweetTrove{}, err
			}
			retweeting_user.ID = UserID(apiv2_user_result.ID)
			ret.Users[retweeting_user.ID] = retweeting_user

			retweet := Retweet{}
			retweet.RetweetID = TweetID(apiv2_tweet.ID)
			retweet.TweetID = TweetID(orig_tweet.ID)
			retweet.RetweetedByID = retweeting_user.ID
			retweet.RetweetedAt, err = time.Parse(time.RubyDate, apiv2_tweet.CreatedAt)
			if err != nil {
				// Report a bad timestamp like every other parse failure instead of panicking
				return TweetTrove{}, fmt.Errorf("parsing timestamp %q of retweet %v: %w", apiv2_tweet.CreatedAt, apiv2_tweet.ID, err)
			}
			ret.Retweets[retweet.RetweetID] = retweet

			// Handle quoted tweet
			if apiv2_retweeted_quoted_result.ID != 0 {
				quoted_tweet, err := ParseSingleTweet(apiv2_retweeted_quoted_result.Legacy)
				if err != nil {
					return TweetTrove{}, err
				}
				ret.Tweets[quoted_tweet.ID] = quoted_tweet

				quoted_user, err := ParseSingleUser(apiv2_retweeted_quoted_user.Legacy)
				if err != nil {
					return TweetTrove{}, err
				}
				quoted_user.ID = UserID(apiv2_retweeted_quoted_user.ID)
				ret.Users[quoted_user.ID] = quoted_user
			}

			continue
		}

		// The main tweet
		tweet, err := ParseSingleTweet(apiv2_tweet.APITweet)
		if err != nil {
			return TweetTrove{}, err
		}
		ret.Tweets[tweet.ID] = tweet

		user, err := ParseSingleUser(apiv2_user_result.Legacy)
		if err != nil {
			return TweetTrove{}, err
		}
		user.ID = UserID(apiv2_user_result.ID)
		ret.Users[user.ID] = user

		// Handle quoted tweet
		if apiv2_quoted_tweet_result.ID != 0 {
			quoted_tweet, err := ParseSingleTweet(apiv2_quoted_tweet_result.Legacy.APITweet)
			if err != nil {
				return TweetTrove{}, err
			}
			ret.Tweets[quoted_tweet.ID] = quoted_tweet

			quoted_user, err := ParseSingleUser(apiv2_quoted_user_result.Legacy)
			if err != nil {
				return TweetTrove{}, err
			}
			quoted_user.ID = UserID(apiv2_quoted_user_result.ID)
			ret.Users[quoted_user.ID] = quoted_user
		}
	}
	return ret, nil
}
/**
 * Build the URL of the "UserTweetsAndReplies" GraphQL endpoint for the given user.
 *
 * The `variables` query parameter is a percent-encoded JSON object.  If `cursor` is non-empty
 * it is query-escaped and added as the "cursor" variable; that variant also sets
 * includePromotedContent to true and adds "__fs_responsive_web_uc_gql_enabled".
 */
func get_graphql_user_timeline_url(user_id UserID, cursor string) string {
	// Everything up to and including the "count" variable is common to both variants
	url_prefix := "https://twitter.com/i/api/graphql/CwLU7qTfeu0doqhSr6tW4A/UserTweetsAndReplies" +
		"?variables=%7B%22userId%22%3A%22" + fmt.Sprint(user_id) + "%22%2C%22count%22%3A40"

	if cursor == "" {
		return url_prefix +
			"%2C%22includePromotedContent%22%3Afalse" +
			"%2C%22withCommunity%22%3Atrue" +
			"%2C%22withSuperFollowsUserFields%22%3Atrue" +
			"%2C%22withBirdwatchPivots%22%3Afalse" +
			"%2C%22withDownvotePerspective%22%3Afalse" +
			"%2C%22withReactionsMetadata%22%3Afalse" +
			"%2C%22withReactionsPerspective%22%3Afalse" +
			"%2C%22withSuperFollowsTweetFields%22%3Atrue" +
			"%2C%22withVoice%22%3Atrue" +
			"%2C%22withV2Timeline%22%3Afalse" +
			"%2C%22__fs_interactive_text%22%3Afalse" +
			"%2C%22__fs_dont_mention_me_view_api_enabled%22%3Afalse" +
			"%7D"
	}

	return url_prefix +
		"%2C%22cursor%22%3A%22" + url.QueryEscape(cursor) + "%22" +
		"%2C%22includePromotedContent%22%3Atrue" +
		"%2C%22withCommunity%22%3Atrue" +
		"%2C%22withSuperFollowsUserFields%22%3Atrue" +
		"%2C%22withBirdwatchPivots%22%3Afalse" +
		"%2C%22withDownvotePerspective%22%3Afalse" +
		"%2C%22withReactionsMetadata%22%3Afalse" +
		"%2C%22withReactionsPerspective%22%3Afalse" +
		"%2C%22withSuperFollowsTweetFields%22%3Atrue" +
		"%2C%22withVoice%22%3Atrue" +
		"%2C%22withV2Timeline%22%3Afalse" +
		"%2C%22__fs_interactive_text%22%3Afalse" +
		"%2C%22__fs_responsive_web_uc_gql_enabled%22%3Afalse" +
		"%2C%22__fs_dont_mention_me_view_api_enabled%22%3Afalse" +
		"%7D"
}
/**
 * Get a User feed using the new GraphQL twitter api
 *
 * args:
 * - user_id: the user whose feed to fetch
 * - cursor: the cursor to fetch from, or "" to fetch the first page
 *
 * returns: the decoded response; or an error if the request can't be built or sent, if the
 * server answers with anything other than 200 OK (the error text then contains the status,
 * the response headers and the response body), or if the body can't be read or decoded
 */
func (api API) GetGraphqlFeedFor(user_id UserID, cursor string) (APIV2Response, error) {
	client := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequest("GET", get_graphql_user_timeline_url(user_id, cursor), nil)
	if err != nil {
		return APIV2Response{}, err
	}

	err = ApiRequestAddTokens(req)
	if err != nil {
		return APIV2Response{}, err
	}

	if cursor != "" {
		// NOTE(review): the cursor is already embedded in the URL's `variables`; confirm
		// whether this additional call is still needed
		UpdateQueryCursor(req, cursor, false)
	}

	resp, err := client.Do(req)
	if err != nil {
		return APIV2Response{}, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best-effort: the body is only used to enrich the error message, so a failure to
		// read it is deliberately ignored
		content, _ := ioutil.ReadAll(resp.Body)

		var headers strings.Builder
		for header := range resp.Header {
			fmt.Fprintf(&headers, " %s: %s\n", header, resp.Header.Get(header))
		}
		return APIV2Response{}, fmt.Errorf("HTTP %s\n%s\n%s", resp.Status, headers.String(), content)
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return APIV2Response{}, err
	}

	// The raw body is intentionally not printed: dumping every response to stdout was
	// leftover debugging output
	var response APIV2Response
	err = json.Unmarshal(body, &response)
	return response, err
}
/**
 * Resend the request to get more tweets if necessary
 *
 * args:
 * - user_id: the user's UserID
 * - response: an "out" parameter; the APIV2Response that tweets, RTs and users will be appended to
 * - min_tweets: the desired minimum amount of tweets to get
 *
 * returns: nil once `response` holds at least `min_tweets` entries or there is no bottom cursor
 * to continue from; END_OF_FEED if a fetched page has no entries; or the error from
 * GetGraphqlFeedFor if a request fails
 */
func (api API) GetMoreTweetsFromGraphqlFeed(user_id UserID, response *APIV2Response, min_tweets int) error {
	// TODO user-feed-infinite-fetch: what if you reach the end of the user's timeline?  Might loop
	// forever getting no new tweets
	initial_instructions := response.Data.User.Result.Timeline.Timeline.Instructions
	if len(initial_instructions) == 0 || len(initial_instructions[0].Entries) == 0 {
		// No entries means no bottom cursor to continue from, and nothing to append to
		return nil
	}

	last_response := response
	for last_response.GetCursorBottom() != "" && len(response.Data.User.Result.Timeline.Timeline.Instructions[0].Entries) < min_tweets {
		fresh_response, err := api.GetGraphqlFeedFor(user_id, last_response.GetCursorBottom())
		if err != nil {
			return err
		}

		// Check for an empty page BEFORE asking it for its bottom cursor: looking up the
		// cursor of a page without entries indexes out of range.
		fresh_instructions := fresh_response.Data.User.Result.Timeline.Timeline.Instructions
		if len(fresh_instructions) == 0 || len(fresh_instructions[0].Entries) == 0 {
			// Empty response (possibly with only a pinned tweet): end of feed has been reached
			return END_OF_FEED // TODO: check that there actually is a pinned tweet and the request didn't just fail lol
		}

		last_response = &fresh_response

		// Copy over the entries
		response.Data.User.Result.Timeline.Timeline.Instructions[0].Entries = append(
			response.Data.User.Result.Timeline.Timeline.Instructions[0].Entries,
			fresh_instructions[0].Entries...)

		fmt.Printf("Have %d entries so far\n", len(response.Data.User.Result.Timeline.Timeline.Instructions[0].Entries))
	}
	return nil
}

View File

@ -0,0 +1,309 @@
package scraper_test
import (
"testing"
"io/ioutil"
"encoding/json"
"offline_twitter/scraper"
)
// Check a plain old tweet: the feed should yield exactly one user, one tweet and no retweets
func TestAPIV2FeedSimpleTweet(t *testing.T) {
	data, err := ioutil.ReadFile("test_responses/feeds_api_v2/feed_simple_tweet.json")
	if err != nil {
		// Fail this test only; a panic would take down the whole test binary
		t.Fatalf("Couldn't read test fixture: %v", err)
	}
	var feed scraper.APIV2Response
	err = json.Unmarshal(data, &feed)
	if err != nil {
		// Nothing below is meaningful if the fixture didn't decode
		t.Fatalf("Couldn't decode test fixture: %v", err)
	}

	tweet_trove, err := feed.ToTweetTrove()
	if err != nil {
		t.Fatalf("ToTweetTrove failed: %v", err)
	}

	if len(tweet_trove.Users) != 1 {
		t.Errorf("Expected 1 user, got %d", len(tweet_trove.Users))
	}
	user := tweet_trove.Users[44067298]
	if user.ID != 44067298 {
		t.Errorf("Expected ID %d, got %d", 44067298, user.ID)
	}
	if user.DisplayName != "Michael Malice" {
		t.Errorf("Expected display name %q, got %q", "Michael Malice", user.DisplayName)
	}

	if len(tweet_trove.Tweets) != 1 {
		t.Errorf("Expected %d tweets, got %d", 1, len(tweet_trove.Tweets))
	}
	tweet := tweet_trove.Tweets[1485708879174508550]
	if tweet.ID != 1485708879174508550 {
		t.Errorf("Expected ID 1485708879174508550, got %d", tweet.ID)
	}
	if tweet.UserID != scraper.UserID(44067298) {
		t.Errorf("Expected user ID 44067298, got %d", tweet.UserID)
	}
	expected_text := "If Boris Johnson is driven out of office, it wouldn't mark the first time the Tories had four PMs in a row\nThey had previously governed the UK for 13 years with 4 PMs, from 1951-1964"
	if tweet.Text != expected_text {
		t.Errorf("Expected text: %q, got: %q", expected_text, tweet.Text)
	}

	if len(tweet_trove.Retweets) != 0 {
		t.Errorf("Shouldn't be any retweets")
	}
}
// Check a retweet: the retweeted tweet, both users and the retweet record should be parsed
func TestAPIV2FeedRetweet(t *testing.T) {
	data, err := ioutil.ReadFile("test_responses/feeds_api_v2/feed_simple_retweet.json")
	if err != nil {
		// Fail this test only; a panic would take down the whole test binary
		t.Fatalf("Couldn't read test fixture: %v", err)
	}
	var feed scraper.APIV2Response
	err = json.Unmarshal(data, &feed)
	if err != nil {
		// Nothing below is meaningful if the fixture didn't decode
		t.Fatalf("Couldn't decode test fixture: %v", err)
	}

	tweet_trove, err := feed.ToTweetTrove()
	if err != nil {
		t.Fatalf("ToTweetTrove failed: %v", err)
	}

	// Should fetch both the retweeting and retweeted users
	if len(tweet_trove.Users) != 2 {
		t.Errorf("Expected %d users, got %d", 2, len(tweet_trove.Users))
	}
	user := tweet_trove.Users[44067298]
	if user.ID != 44067298 {
		t.Errorf("Expected ID %d, got %d", 44067298, user.ID)
	}
	if user.DisplayName != "Michael Malice" {
		t.Errorf("Expected display name %q, got %q", "Michael Malice", user.DisplayName)
	}
	retweeted_user := tweet_trove.Users[1326229737551912960]
	if retweeted_user.ID != 1326229737551912960 {
		t.Errorf("Expected ID %d, got %d", 1326229737551912960, retweeted_user.ID)
	}
	if retweeted_user.Handle != "libsoftiktok" {
		t.Errorf("Expected handle %q, got %q", "libsoftiktok", retweeted_user.Handle)
	}

	// Should only be 1 tweet, the retweeted one
	if len(tweet_trove.Tweets) != 1 {
		t.Errorf("Expected %d tweets, got %d", 1, len(tweet_trove.Tweets))
	}
	tweet, ok := tweet_trove.Tweets[1485694028620316673]
	if !ok {
		t.Fatalf("Didn't get the tweet")
	}
	if tweet.ID != 1485694028620316673 {
		t.Errorf("Expected ID %d, got %d", 1485694028620316673, tweet.ID)
	}
	if tweet.UserID != scraper.UserID(1326229737551912960) {
		t.Errorf("Expected user ID %d, got %d", 1326229737551912960, tweet.UserID)
	}
	expected_text := "More mask madness, this time in an elevator. The mask police are really nuts https://t.co/3BpvLjdJwD"
	if tweet.Text != expected_text {
		t.Errorf("Expected text: %q, got: %q", expected_text, tweet.Text)
	}

	// Should be 1 retweet
	if len(tweet_trove.Retweets) != 1 {
		t.Errorf("Expected %d retweets, got %d", 1, len(tweet_trove.Retweets))
	}
	retweet := tweet_trove.Retweets[1485699748514476037]
	if retweet.RetweetID != 1485699748514476037 {
		t.Errorf("Expected RetweetID %d, got %d", 1485699748514476037, retweet.RetweetID)
	}
	if retweet.TweetID != 1485694028620316673 {
		t.Errorf("Expected TweetID 1485694028620316673, got %d", retweet.TweetID)
	}
	if retweet.RetweetedAt.Unix() != 1643053397 {
		t.Errorf("Expected retweeted_at %d, got %d", 1643053397, retweet.RetweetedAt.Unix())
	}
	if retweet.RetweetedByID != scraper.UserID(44067298) {
		t.Errorf("Expected retweeted_by 44067298, got %d", retweet.RetweetedByID)
	}
}
// Check a quote-tweet: both the quoting and the quoted tweet, and both users, should be parsed
func TestAPIV2FeedQuoteTweet(t *testing.T) {
	data, err := ioutil.ReadFile("test_responses/feeds_api_v2/feed_quote_tweet.json")
	if err != nil {
		// Fail this test only; a panic would take down the whole test binary
		t.Fatalf("Couldn't read test fixture: %v", err)
	}
	var feed scraper.APIV2Response
	err = json.Unmarshal(data, &feed)
	if err != nil {
		// Nothing below is meaningful if the fixture didn't decode
		t.Fatalf("Couldn't decode test fixture: %v", err)
	}

	tweet_trove, err := feed.ToTweetTrove()
	if err != nil {
		t.Fatalf("ToTweetTrove failed: %v", err)
	}

	// Should be 2 users: quoter and quoted
	if len(tweet_trove.Users) != 2 {
		t.Errorf("Expected %d users, got %d", 2, len(tweet_trove.Users))
	}
	quoting_user := tweet_trove.Users[44067298]
	if quoting_user.ID != 44067298 {
		t.Errorf("Expected quoting user ID %d, got %d", 44067298, quoting_user.ID)
	}
	quoted_user := tweet_trove.Users[892155218292617217]
	if quoted_user.ID != 892155218292617217 {
		t.Errorf("Expected quoted user ID %d, got %d", 892155218292617217, quoted_user.ID)
	}
	expected_quoted_bio := "Creator of Little Homes and Mooncars"
	if quoted_user.Bio != expected_quoted_bio {
		t.Errorf("Expected bio %q, got %q", expected_quoted_bio, quoted_user.Bio)
	}

	// Should be 2 tweets: quote-tweet and quoted-tweet
	if len(tweet_trove.Tweets) != 2 {
		t.Errorf("Expected %d tweets, got %d", 2, len(tweet_trove.Tweets))
	}
	quoted_tweet := tweet_trove.Tweets[1485690069079846915]
	if quoted_tweet.ID != 1485690069079846915 {
		t.Errorf("Expected quoted ID %d, got %d", 1485690069079846915, quoted_tweet.ID)
	}
	expected_quoted_text := "The Left hates the Right so much that they won't let them leave the Union. I don't get it."
	if quoted_tweet.Text != expected_quoted_text {
		t.Errorf("Expected text %q, got %q", expected_quoted_text, quoted_tweet.Text)
	}
	quote_tweet := tweet_trove.Tweets[1485690410899021826]
	if quote_tweet.ID != 1485690410899021826 {
		t.Errorf("Expected quoting ID %d, got %d", 1485690410899021826, quote_tweet.ID)
	}
	if quote_tweet.QuotedTweetID != 1485690069079846915 {
		t.Errorf("Expected to be quoting tweet ID %d, got %d", 1485690069079846915, quote_tweet.QuotedTweetID)
	}

	// No retweets
	if len(tweet_trove.Retweets) != 0 {
		t.Errorf("Shouldn't be any retweets")
	}
}
// Check a retweeted quote-tweet: the retweeter, the quoter and the quoted user should all be
// parsed, along with the quoting and quoted tweets and the retweet record
func TestAPIV2FeedRetweetedQuoteTweet(t *testing.T) {
	data, err := ioutil.ReadFile("test_responses/feeds_api_v2/feed_retweeted_quote_tweet.json")
	if err != nil {
		// Fail this test only; a panic would take down the whole test binary
		t.Fatalf("Couldn't read test fixture: %v", err)
	}
	var feed scraper.APIV2Response
	err = json.Unmarshal(data, &feed)
	if err != nil {
		// Nothing below is meaningful if the fixture didn't decode
		t.Fatalf("Couldn't decode test fixture: %v", err)
	}

	tweet_trove, err := feed.ToTweetTrove()
	if err != nil {
		t.Fatalf("ToTweetTrove failed: %v", err)
	}

	// 3 Users: quoted, quoter, and retweeter
	if len(tweet_trove.Users) != 3 {
		t.Errorf("Expected %d users, got %d", 3, len(tweet_trove.Users))
	}
	retweeting_user := tweet_trove.Users[599817378]
	if retweeting_user.ID != 599817378 {
		t.Errorf("Expected retweeting user ID %d, got %d", 599817378, retweeting_user.ID)
	}
	if retweeting_user.Website != "https://www.youtube.com/highlyrespected" {
		t.Errorf("Expected RTing user website %q, got %q", "https://www.youtube.com/highlyrespected", retweeting_user.Website)
	}
	retweeted_user := tweet_trove.Users[1434720042193760256]
	if retweeted_user.ID != 1434720042193760256 {
		t.Errorf("Expected retweed user ID %d, got %d", 1434720042193760256, retweeted_user.ID)
	}
	if retweeted_user.FollowersCount != 17843 {
		t.Errorf("Expected %d followers, got %d", 17843, retweeted_user.FollowersCount)
	}
	quoted_user := tweet_trove.Users[14347972]
	if quoted_user.ID != 14347972 {
		t.Errorf("Expected quoted user ID %d, got %d", 14347972, quoted_user.ID)
	}
	if quoted_user.IsVerified != true {
		t.Errorf("Expected quoted user to be verified")
	}

	// Quoted tweet and quoting tweet
	if len(tweet_trove.Tweets) != 2 {
		t.Errorf("Expected %d tweets, got %d", 2, len(tweet_trove.Tweets))
	}

	// The retweet
	if len(tweet_trove.Retweets) != 1 {
		t.Errorf("Expected %d retweets, got %d", 1, len(tweet_trove.Retweets))
	}
}
// Check a full user feed: spot-check some users, the number of retweets and the bottom cursor
func TestParseAPIV2UserFeed(t *testing.T) {
	data, err := ioutil.ReadFile("test_responses/feeds_api_v2/user_feed_apiv2.json")
	if err != nil {
		// Fail this test only; a panic would take down the whole test binary
		t.Fatalf("Couldn't read test fixture: %v", err)
	}
	var feed scraper.APIV2Response
	err = json.Unmarshal(data, &feed)
	if err != nil {
		// Nothing below is meaningful if the fixture didn't decode
		t.Fatalf("Couldn't decode test fixture: %v", err)
	}

	tweet_trove, err := feed.ToTweetTrove()
	if err != nil {
		t.Fatalf("ToTweetTrove failed: %v", err)
	}

	// Check users
	user := tweet_trove.Users[44067298]
	if user.ID != 44067298 {
		t.Errorf("Expected ID %d, got %d", 44067298, user.ID)
	}
	if user.DisplayName != "Michael Malice" {
		t.Errorf("Expected display name %q, got %q", "Michael Malice", user.DisplayName)
	}
	retweeted_user := tweet_trove.Users[1326229737551912960]
	if retweeted_user.ID != 1326229737551912960 {
		t.Errorf("Expected ID %d, got %d", 1326229737551912960, retweeted_user.ID)
	}
	if retweeted_user.Handle != "libsoftiktok" {
		t.Errorf("Expected handle %q, got %q", "libsoftiktok", retweeted_user.Handle)
	}
	quote_tweeted_user := tweet_trove.Users[892155218292617217]
	if quote_tweeted_user.ID != 892155218292617217 {
		t.Errorf("Expected ID %d, got %d", 892155218292617217, quote_tweeted_user.ID)
	}

	// Check retweets
	if len(tweet_trove.Retweets) != 2 {
		t.Errorf("Expected %d retweets but got %d", 2, len(tweet_trove.Retweets))
	}

	// Test cursor-bottom
	bottom_cursor := feed.GetCursorBottom()
	if bottom_cursor != "HBaYgL2Fp/T7nCkAAA==" {
		t.Errorf("Expected cursor %q, got %q", "HBaYgL2Fp/T7nCkAAA==", bottom_cursor)
	}

	// Informational only; shown with `go test -v` instead of always being written to stderr
	t.Logf("Parsed %d users", len(tweet_trove.Users))
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

31
scraper/tweet_trove.go Normal file
View File

@ -0,0 +1,31 @@
package scraper
/**
 * A collection of parsed Tweets, Users and Retweets, each indexed by ID.
 *
 * The maps must be initialized before they are written to; use NewTweetTrove to get a
 * ready-to-use value.
 */
type TweetTrove struct {
// Tweets, keyed by their own tweet ID
Tweets map[TweetID]Tweet
// Users, keyed by their own user ID
Users map[UserID]User
// Retweets, keyed by RetweetID (the ID of the retweet itself, not of the retweeted tweet)
Retweets map[TweetID]Retweet
}
/**
 * Create an empty TweetTrove whose three maps are all initialized and ready to be written to
 */
func NewTweetTrove() TweetTrove {
	return TweetTrove{
		Tweets:   make(map[TweetID]Tweet),
		Users:    make(map[UserID]User),
		Retweets: make(map[TweetID]Retweet),
	}
}
/**
 * Make it compatible with previous silly interface if needed
 *
 * Flattens the three maps of the trove into three slices.  The order of the items within each
 * slice is unspecified, since it follows map iteration order.  A slice is nil if the
 * corresponding map is empty.
 */
func (trove TweetTrove) Transform() (tweets []Tweet, retweets []Retweet, users []User) {
	for _, tweet := range trove.Tweets {
		tweets = append(tweets, tweet)
	}

	for _, retweet := range trove.Retweets {
		retweets = append(retweets, retweet)
	}

	for _, user := range trove.Users {
		users = append(users, user)
	}

	return tweets, retweets, users
} // TODO: refactor until this function isn't needed anymore