Add tweet parsing

2021-05-22 18:20:18 -04:00 · 2021-05-22 18:20:18 -04:00 · 2c8fe25e78
commit 2c8fe25e78
parent 88086c5b47
7 changed files with 353 additions and 2 deletions
--- a/doc/curl
+++ b/doc/curl
@ -3,13 +3,25 @@ curl -X POST \
    https://api.twitter.com/1.1/guest/activate.json
 #
 # A user profile:
 curl \
    -H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
-    -H "X-Guest-Token: 1391174194361217029" \
+    -H "X-Guest-Token: 1396177150890348547" \
    https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22michaelmalice%22%2C%22withHighlightedLabel%22%3Atrue%7D
 #
 # A user's feed:
 curl \
    -H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
-    -H "X-Guest-Token: 1391214296126967816" \
+    -H "X-Guest-Token: 1396177150890348547" \
    https://api.twitter.com/2/timeline/profile/44067298.json
 #
 # A tweet and replies (conversation):
 curl \
    -H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
    -H "X-Guest-Token: 1396177150890348547" \
    https://twitter.com/i/api/2/timeline/conversation/1395881699142160387.json
--- a/scraper/api_types.go
+++ b/scraper/api_types.go
@ -0,0 +1,114 @@
 package scraper
 import "time"
 // APITweet mirrors a single tweet object as it appears in the JSON returned
 // by the Twitter API.  Only the keys the scraper reads are declared; any other
 // key in the payload is ignored by encoding/json.
 type APITweet struct {
 	ID                string `json:"id_str"`
 	ConversationIDStr string `json:"conversation_id_str"`
 	// CreatedAt is kept as the raw string from the API; ParseSingleTweet
 	// parses it with the time.RubyDate layout.
 	CreatedAt     string `json:"created_at"`
 	FavoriteCount int    `json:"favorite_count"`
 	FullText      string `json:"full_text"`
 	// Entities lists the hashtags, media, links and mentions found in FullText.
 	Entities struct {
 		Hashtags []struct {
 			Text string `json:"text"`
 		} `json:"hashtags"`
 		Media []struct {
 			MediaURLHttps string `json:"media_url_https"`
 			Type          string `json:"type"`
 			// URL is the shortened link to the media as it appears in FullText.
 			URL string `json:"url"`
 		} `json:"media"`
 		URLs []struct {
 			ExpandedURL string `json:"expanded_url"`
 			// URL is the shortened link as it appears in FullText.
 			URL string `json:"url"`
 		} `json:"urls"`
 		// The API delivers mentions under the "user_mentions" key.  Without an
 		// explicit tag encoding/json would look for a key called "mentions"
 		// and this slice would always stay empty.
 		Mentions []struct {
 			UserName string `json:"screen_name"`
 			UserID   string `json:"id_str"`
 		} `json:"user_mentions"`
 	} `json:"entities"`
 	// ExtendedEntities carries the richer media description, including the
 	// video variants when present.
 	ExtendedEntities struct {
 		Media []struct {
 			IDStr         string `json:"id_str"`
 			MediaURLHttps string `json:"media_url_https"`
 			Type          string `json:"type"`
 			VideoInfo     struct {
 				Variants []struct {
 					Bitrate int    `json:"bitrate,omitempty"`
 					URL     string `json:"url"`
 				} `json:"variants"`
 			} `json:"video_info"`
 		} `json:"media"`
 	} `json:"extended_entities"`
 	InReplyToStatusIDStr string    `json:"in_reply_to_status_id_str"`
 	InReplyToScreenName  string    `json:"in_reply_to_screen_name"`
 	ReplyCount           int       `json:"reply_count"`
 	RetweetCount         int       `json:"retweet_count"`
 	QuoteCount           int       `json:"quote_count"`
 	RetweetedStatusIDStr string    `json:"retweeted_status_id_str"`
 	QuotedStatusIDStr    string    `json:"quoted_status_id_str"`
 	Time                 time.Time `json:"time"`
 	UserIDStr            string    `json:"user_id_str"`
 }
 // TweetResponse is the decoding target for a timeline/conversation response.
 // Only the "globalObjects" section is declared here: a map of tweets and a
 // map of the users that appear alongside them.
 type TweetResponse struct {
 	GlobalObjects struct {
 		// Tweets is keyed by a string; the tests look entries up by tweet ID.
 		Tweets map[string]APITweet `json:"tweets"`
 		// Users is keyed by a string (presumably the user ID, by analogy with
 		// Tweets; confirm against a real response).
 		Users  map[string]struct {
 			CreatedAt   string `json:"created_at"`
 			Description string `json:"description"`
 			// Entities.URL.Urls holds the expanded form of the profile link(s).
 			Entities    struct {
 				URL struct {
 					Urls []struct {
 						ExpandedURL string `json:"expanded_url"`
 					} `json:"urls"`
 				} `json:"url"`
 			} `json:"entities"`
 			FavouritesCount      int      `json:"favourites_count"`
 			FollowersCount       int      `json:"followers_count"`
 			FriendsCount         int      `json:"friends_count"`
 			IDStr                string   `json:"id_str"`
 			ListedCount          int      `json:"listed_count"`
 			Name                 string   `json:"name"`
 			Location             string   `json:"location"`
 			PinnedTweetIdsStr    []string `json:"pinned_tweet_ids_str"`
 			ProfileBannerURL     string   `json:"profile_banner_url"`
 			ProfileImageURLHTTPS string   `json:"profile_image_url_https"`
 			Protected            bool     `json:"protected"`
 			ScreenName           string   `json:"screen_name"`
 			StatusesCount        int      `json:"statuses_count"`
 			Verified             bool     `json:"verified"`
 		} `json:"users"`
 	} `json:"globalObjects"`
 }
 // UserResponse is the decoding target for a user-profile response.  The
 // payload nests the profile as data -> user -> legacy; the user's ID sits
 // next to "legacy" under the "rest_id" key rather than inside it.
 type UserResponse struct {
 	Data struct {
 		User struct {
 			ID     string `json:"rest_id"`
 			// Legacy holds the profile fields.  They use the same JSON keys as
 			// the user objects in TweetResponse, minus "id_str".
 			Legacy struct {
 				CreatedAt   string `json:"created_at"`
 				Description string `json:"description"`
 				// Entities.URL.Urls holds the expanded form of the profile link(s).
 				Entities    struct {
 					URL struct {
 						Urls []struct {
 							ExpandedURL string `json:"expanded_url"`
 						} `json:"urls"`
 					} `json:"url"`
 				} `json:"entities"`
 				FavouritesCount      int      `json:"favourites_count"`
 				FollowersCount       int      `json:"followers_count"`
 				FriendsCount         int      `json:"friends_count"`
 				ListedCount          int      `json:"listed_count"`
 				Name                 string   `json:"name"`
 				Location             string   `json:"location"`
 				PinnedTweetIdsStr    []string `json:"pinned_tweet_ids_str"`
 				ProfileBannerURL     string   `json:"profile_banner_url"`
 				ProfileImageURLHTTPS string   `json:"profile_image_url_https"`
 				Protected            bool     `json:"protected"`
 				ScreenName           string   `json:"screen_name"`
 				StatusesCount        int      `json:"statuses_count"`
 				Verified             bool     `json:"verified"`
 			} `json:"legacy"`
 		} `json:"user"`
 	} `json:"data"`
 }
--- a/scraper/test_responses/dave_smith_anarchist_handbook.json
+++ b/scraper/test_responses/dave_smith_anarchist_handbook.json
--- a/scraper/test_responses/midriffs_anarchist_cookbook.json
+++ b/scraper/test_responses/midriffs_anarchist_cookbook.json
--- a/scraper/tweet.go
+++ b/scraper/tweet.go
@ -0,0 +1,94 @@
 package scraper
 import (
 	"time"
 	"fmt"
 	"strings"
 )
 // TweetID identifies a tweet.  It is the API's string form of the ID
 // ("id_str"), stored as-is rather than converted to a number.
 type TweetID string
 // Tweet is the scraper's own model of a tweet, produced from an APITweet by
 // ParseSingleTweet.
 type Tweet struct {
 	ID             TweetID
 	// User is the ID of the tweet's author.
 	User           UserID
 	// Text is the tweet body after ParseSingleTweet's clean-up of trailing
 	// links and the leading reply mention.
 	Text           string
 	PostedAt       time.Time
 	NumLikes       int
 	NumRetweets    int
 	NumReplies     int
 	NumQuoteTweets int
 	// HasVideo is not populated yet; ParseSingleTweet always sets it to false.
 	HasVideo       bool
 	// InReplyTo is the ID of the tweet being replied to, taken verbatim from
 	// the API (presumably empty when the tweet is not a reply).
 	InReplyTo      TweetID
 	// Urls holds the expanded form of each link in the tweet.
 	Urls        []string
 	// Images holds the HTTPS URL of each attached photo.
 	Images      []string
 	// Mentions holds the IDs (not the screen names) of the mentioned users.
 	Mentions    []UserID
 	Hashtags    []string
 	// QuotedTweet is the ID of the quoted tweet; the tests treat the empty
 	// string as "does not quote anything".
 	QuotedTweet TweetID
 }
 // String renders the tweet as a two-line, human-readable summary: identity,
 // quoted text, timestamp and engagement counters on the first line, then the
 // links, images, mentions and hashtags on the second.
 func (t Tweet) String() string {
 	// The raw string spans two source lines on purpose: the line break (and
 	// any whitespace that opens the continuation line) is part of the output.
 	const layout = `ID %s, User %s: %q (%s). Likes: %d, Retweets: %d, QTs: %d, Replies: %d.
 Urls: %v   Images: %v   Mentions: %v   Hashtags: %v`
 	return fmt.Sprintf(
 		layout,
 		t.ID, t.User, t.Text, t.PostedAt,
 		t.NumLikes, t.NumRetweets, t.NumQuoteTweets, t.NumReplies,
 		t.Urls, t.Images, t.Mentions, t.Hashtags,
 	)
 }
 // ParseSingleTweet converts a raw APITweet into the scraper's Tweet model.
 //
 // The text is cleaned up for display:
 //   - when the tweet has exactly one link (and/or exactly one media item) and
 //     its shortened URL closes the text, that URL and the whitespace before it
 //     are removed;
 //   - when the tweet is a reply whose text opens with "@<screen name>" of the
 //     user being replied to, that mention and the separator after it are
 //     removed.
 //
 // It returns an error, together with a zero Tweet, when CreatedAt is not in
 // time.RubyDate format or when a media entity has a type other than "photo".
 func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
 	// Run every step that can fail first, so that a non-nil error is never
 	// paired with a half-filled Tweet.
 	postedAt, err := time.Parse(time.RubyDate, apiTweet.CreatedAt)
 	if err != nil {
 		return Tweet{}, fmt.Errorf("parsing created_at of tweet %q: %w", apiTweet.ID, err)
 	}
 	for _, media := range apiTweet.Entities.Media {
 		if media.Type != "photo" {
 			// Reported as an error rather than a panic: this is input we do
 			// not understand yet, not a broken program invariant.
 			return Tweet{}, fmt.Errorf("unknown media type: %q", media.Type)
 		}
 	}
 	text := apiTweet.FullText
 	// Remove embedded links at the end of the text.
 	if urls := apiTweet.Entities.URLs; len(urls) == 1 {
 		text = stripTrailingLink(text, urls[0].URL)
 	}
 	if media := apiTweet.Entities.Media; len(media) == 1 {
 		text = stripTrailingLink(text, media[0].URL)
 	}
 	// Remove the leading `@username` for replies.
 	if name := apiTweet.InReplyToScreenName; name != "" {
 		text = stripLeadingMention(text, name)
 	}
 	ret = Tweet{
 		ID:             TweetID(apiTweet.ID),
 		User:           UserID(apiTweet.UserIDStr),
 		Text:           text,
 		PostedAt:       postedAt,
 		NumLikes:       apiTweet.FavoriteCount,
 		NumRetweets:    apiTweet.RetweetCount,
 		NumReplies:     apiTweet.ReplyCount,
 		NumQuoteTweets: apiTweet.QuoteCount,
 		HasVideo:       false, // TODO
 		InReplyTo:      TweetID(apiTweet.InReplyToStatusIDStr),
 		QuotedTweet:    TweetID(apiTweet.QuotedStatusIDStr),
 	}
 	// The slices are grown with append so they stay nil when there is nothing
 	// to add.
 	for _, url := range apiTweet.Entities.URLs {
 		ret.Urls = append(ret.Urls, url.ExpandedURL)
 	}
 	for _, media := range apiTweet.Entities.Media {
 		ret.Images = append(ret.Images, media.MediaURLHttps)
 	}
 	for _, hashtag := range apiTweet.Entities.Hashtags {
 		ret.Hashtags = append(ret.Hashtags, hashtag.Text)
 	}
 	for _, mention := range apiTweet.Entities.Mentions {
 		ret.Mentions = append(ret.Mentions, UserID(mention.UserID))
 	}
 	return ret, nil
 }
 // stripTrailingLink removes link from the end of text, along with the
 // whitespace that separated it from the rest.  text is returned unchanged when
 // link is empty or is not its suffix.
 func stripTrailingLink(text, link string) string {
 	if link == "" || !strings.HasSuffix(text, link) {
 		return text
 	}
 	return strings.TrimRight(strings.TrimSuffix(text, link), " \t\r\n")
 }
 // stripLeadingMention removes a leading "@screenName" and the single
 // whitespace character after it.  text is returned unchanged when it does not
 // start with that mention, or when the mention is merely the beginning of a
 // longer word (e.g. "@bobby" for the screen name "bob").
 func stripLeadingMention(text, screenName string) string {
 	mention := "@" + screenName
 	if !strings.HasPrefix(text, mention) {
 		return text
 	}
 	rest := text[len(mention):]
 	if rest == "" {
 		// The whole text was just the mention.
 		return ""
 	}
 	switch rest[0] {
 	case ' ', '\t', '\n':
 		return rest[1:]
 	}
 	return text
 }
--- a/scraper/tweet_test.go
+++ b/scraper/tweet_test.go
@ -0,0 +1,126 @@
 package scraper_test
 import (
 	// "fmt"
 	"encoding/json"
 	"io/ioutil"
 	"testing"
 	"offline_twitter/scraper"
 )
 // TestParseSingleTweet decodes the recorded conversation response for the
 // Dave Smith tweet, checks how many tweets and users it contains, and verifies
 // the text that ParseSingleTweet produces for the main tweet.
 func TestParseSingleTweet(t *testing.T) {
 	data, err := ioutil.ReadFile("test_responses/dave_smith_anarchist_handbook.json")
 	if err != nil {
 		// A missing fixture is a test failure, not a reason to crash the run.
 		t.Fatalf("reading fixture: %v", err)
 	}
 	var tweetResp scraper.TweetResponse
 	if err = json.Unmarshal(data, &tweetResp); err != nil {
 		// Nothing below is meaningful without a decoded response.
 		t.Fatalf("decoding fixture: %v", err)
 	}
 	tweets := tweetResp.GlobalObjects.Tweets
 	users := tweetResp.GlobalObjects.Users
 	if len(tweets) != 11 {
 		t.Errorf("Expected %d tweets, got %d instead", 11, len(tweets))
 	}
 	if len(users) != 11 {
 		t.Errorf("Expected %d users, got %d instead", 11, len(users))
 	}
 	daveSmithTweet, ok := tweets["1395881699142160387"]
 	if !ok {
 		// Stop here: parsing a zero-value tweet would only add a misleading
 		// secondary failure.
 		t.Fatalf("Didn't find the Dave Smith tweet.")
 	}
 	tweet, err := scraper.ParseSingleTweet(daveSmithTweet)
 	if err != nil {
 		t.Fatalf("parsing tweet: %v", err)
 	}
 	expectedText := "The fact that @michaelmalice new book ‘The Anarchist Handbook’ is just absolutely destroying on the charts is the largest white pill I’ve swallowed in years."
 	if tweet.Text != expectedText {
 		t.Errorf("Expected: %q; got %q", expectedText, tweet.Text)
 	}
 }
 // TestParseSingleTweet2 decodes the recorded conversation response that
 // contains a tweet with an image and a quote-tweet, then checks the cleaned-up
 // text, the image list and the quoted-tweet link of the parsed results.
 func TestParseSingleTweet2(t *testing.T) {
 	data, err := ioutil.ReadFile("test_responses/midriffs_anarchist_cookbook.json")
 	if err != nil {
 		// A missing fixture is a test failure, not a reason to crash the run.
 		t.Fatalf("reading fixture: %v", err)
 	}
 	var tweetResp scraper.TweetResponse
 	if err = json.Unmarshal(data, &tweetResp); err != nil {
 		// Nothing below is meaningful without a decoded response.
 		t.Fatalf("decoding fixture: %v", err)
 	}
 	tweets := tweetResp.GlobalObjects.Tweets
 	users := tweetResp.GlobalObjects.Users
 	if len(tweets) != 12 {
 		// The message reports the same number the condition checks.
 		t.Errorf("Expected %d tweets, got %d instead", 12, len(tweets))
 	}
 	if len(users) != 11 {
 		t.Errorf("Expected %d users, got %d instead", 11, len(users))
 	}
 	t1, ok := tweets["1395882872729477131"]
 	if !ok {
 		t.Fatalf("Didn't find first tweet")
 	}
 	t2, ok := tweets["1396194922009661441"]
 	if !ok {
 		t.Fatalf("Didn't find second tweet")
 	}
 	tweet1, err := scraper.ParseSingleTweet(t1)
 	if err != nil {
 		t.Fatalf("parsing first tweet: %v", err)
 	}
 	tweet2, err := scraper.ParseSingleTweet(t2)
 	if err != nil {
 		t.Fatalf("parsing second tweet: %v", err)
 	}
 	expectedText := "this saddens me every time"
 	if tweet1.Text != expectedText {
 		t.Errorf("Expected: %q, got: %q", expectedText, tweet1.Text)
 	}
 	expectedText = "sometimes they're too dimwitted to even get the wrong title right"
 	if tweet2.Text != expectedText {
 		t.Errorf("Expected: %q, got: %q", expectedText, tweet2.Text)
 	}
 	if len(tweet1.Images) != 1 {
 		t.Errorf("Expected 1 images but got %d", len(tweet1.Images))
 	}
 	if tweet1.QuotedTweet != "" {
 		t.Errorf("Incorrectly believes it quote-tweets %q", tweet1.QuotedTweet)
 	}
 	if tweet2.QuotedTweet == "" {
 		t.Errorf("Should be a quoted tweet")
 	}
 	quotedAPITweet, ok := tweets[string(tweet2.QuotedTweet)]
 	if !ok {
 		// Stop here: parsing a zero-value tweet would only add a misleading
 		// secondary failure.
 		t.Fatalf("Couldn't find the quoted tweet")
 	}
 	quotedTweet, err := scraper.ParseSingleTweet(quotedAPITweet)
 	if err != nil {
 		t.Fatalf("parsing quoted tweet: %v", err)
 	}
 	expectedText = "I always liked \"The Anarchist's Cookbook.\""
 	if quotedTweet.Text != expectedText {
 		t.Errorf("Expected %q, got %q", expectedText, quotedTweet.Text)
 	}
 }
--- a/scraper/user.go
+++ b/scraper/user.go
@ -0,0 +1,3 @@
 package scraper
 // UserID identifies a Twitter user.  It is the API's string form of the ID
 // (e.g. "user_id_str" on a tweet), stored as-is rather than converted to a
 // number.
 type UserID string