Add tweet parsing

2021-05-22 18:20:18 -04:00 · 2021-05-22 18:20:18 -04:00 · 2c8fe25e78
commit 2c8fe25e78
parent 88086c5b47
7 changed files with 353 additions and 2 deletions
--- a/doc/curl
+++ b/doc/curl
@ -3,13 +3,25 @@ curl -X POST \
    https://api.twitter.com/1.1/guest/activate.json


+#
+# A user profile:
 curl \
    -H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
-    -H "X-Guest-Token: 1391174194361217029" \
+    -H "X-Guest-Token: 1396177150890348547" \
    https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22michaelmalice%22%2C%22withHighlightedLabel%22%3Atrue%7D


+#
+# A user's feed:
 curl \
    -H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
-    -H "X-Guest-Token: 1391214296126967816" \
+    -H "X-Guest-Token: 1396177150890348547" \
    https://api.twitter.com/2/timeline/profile/44067298.json
+
+
+#
+# A tweet and replies (conversation):
+curl \
+    -H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
+    -H "X-Guest-Token: 1396177150890348547" \
+    https://twitter.com/i/api/2/timeline/conversation/1395881699142160387.json
--- a/scraper/api_types.go
+++ b/scraper/api_types.go
@ -0,0 +1,114 @@
+package scraper
+
+import "time"
+
+// APITweet mirrors the JSON object describing a single tweet, i.e. one value
+// of the `globalObjects.tweets` map in a TweetResponse.  Only the keys listed
+// here are decoded; everything else in the payload is ignored.
+type APITweet struct {
+	ID                string `json:"id_str"`
+	ConversationIDStr string `json:"conversation_id_str"`
+	// CreatedAt is the raw timestamp string; ParseSingleTweet parses it using
+	// the time.RubyDate layout.
+	CreatedAt     string `json:"created_at"`
+	FavoriteCount int    `json:"favorite_count"`
+	FullText      string `json:"full_text"`
+	// Entities lists the hashtags, media, links and user mentions that occur
+	// in FullText.
+	Entities struct {
+		Hashtags []struct {
+			Text string `json:"text"`
+		} `json:"hashtags"`
+		Media []struct {
+			MediaURLHttps string `json:"media_url_https"`
+			Type          string `json:"type"`
+			// URL is the shortened link as it appears inside FullText.
+			URL string `json:"url"`
+		} `json:"media"`
+		URLs []struct {
+			ExpandedURL string `json:"expanded_url"`
+			// URL is the shortened link as it appears inside FullText.
+			URL string `json:"url"`
+		} `json:"urls"`
+		// Mentions are the users @-mentioned in the tweet.  The API key is
+		// `user_mentions`; without an explicit tag encoding/json would only
+		// match a key spelled "mentions" and this slice would always stay
+		// empty.
+		Mentions []struct {
+			UserName string `json:"screen_name"`
+			UserID   string `json:"id_str"`
+		} `json:"user_mentions"`
+	} `json:"entities"`
+	// ExtendedEntities carries the richer media description, including the
+	// video variants (bitrate + URL) when present.
+	ExtendedEntities struct {
+		Media []struct {
+			IDStr         string `json:"id_str"`
+			MediaURLHttps string `json:"media_url_https"`
+			Type          string `json:"type"`
+			VideoInfo     struct {
+				Variants []struct {
+					Bitrate int    `json:"bitrate,omitempty"`
+					URL     string `json:"url"`
+				} `json:"variants"`
+			} `json:"video_info"`
+		} `json:"media"`
+	} `json:"extended_entities"`
+	InReplyToStatusIDStr string `json:"in_reply_to_status_id_str"`
+	InReplyToScreenName  string `json:"in_reply_to_screen_name"`
+	ReplyCount           int    `json:"reply_count"`
+	RetweetCount         int    `json:"retweet_count"`
+	QuoteCount           int    `json:"quote_count"`
+	RetweetedStatusIDStr string `json:"retweeted_status_id_str"`
+	QuotedStatusIDStr    string `json:"quoted_status_id_str"`
+	// NOTE(review): nothing in this package reads or sets Time yet; presumably
+	// it is meant to hold the parsed CreatedAt -- confirm before relying on it.
+	Time      time.Time `json:"time"`
+	UserIDStr string    `json:"user_id_str"`
+}
+
+// TweetResponse is the top-level JSON document decoded from a timeline or
+// conversation response.  Everything of interest lives under the
+// `globalObjects` key.
+type TweetResponse struct {
+	GlobalObjects struct {
+		// Tweets maps a tweet ID (as a decimal string) to its payload.
+		Tweets map[string]APITweet `json:"tweets"`
+		// Users holds the profile data of the users referenced by the response.
+		// Presumably keyed by the user's ID string -- verify against a fixture.
+		Users map[string]struct {
+			CreatedAt   string `json:"created_at"`
+			Description string `json:"description"`
+			Entities    struct {
+				URL struct {
+					Urls []struct {
+						ExpandedURL string `json:"expanded_url"`
+					} `json:"urls"`
+				} `json:"url"`
+			} `json:"entities"`
+			FavouritesCount      int      `json:"favourites_count"`
+			FollowersCount       int      `json:"followers_count"`
+			FriendsCount         int      `json:"friends_count"`
+			IDStr                string   `json:"id_str"`
+			ListedCount          int      `json:"listed_count"`
+			Name                 string   `json:"name"`
+			Location             string   `json:"location"`
+			PinnedTweetIdsStr    []string `json:"pinned_tweet_ids_str"`
+			ProfileBannerURL     string   `json:"profile_banner_url"`
+			ProfileImageURLHTTPS string   `json:"profile_image_url_https"`
+			Protected            bool     `json:"protected"`
+			ScreenName           string   `json:"screen_name"`
+			StatusesCount        int      `json:"statuses_count"`
+			Verified             bool     `json:"verified"`
+		} `json:"users"`
+	} `json:"globalObjects"`
+}
+
+// UserResponse is the top-level JSON document for a single user profile: the
+// user's ID sits at `data.user.rest_id` and the profile fields under
+// `data.user.legacy`.
+// NOTE(review): presumably the shape returned by the GraphQL
+// UserByScreenName endpoint shown in doc/curl -- confirm.
+type UserResponse struct {
+	Data struct {
+		User struct {
+			ID     string `json:"rest_id"`
+			Legacy struct {
+				CreatedAt   string `json:"created_at"`
+				Description string `json:"description"`
+				Entities    struct {
+					URL struct {
+						Urls []struct {
+							ExpandedURL string `json:"expanded_url"`
+						} `json:"urls"`
+					} `json:"url"`
+				} `json:"entities"`
+				FavouritesCount      int      `json:"favourites_count"`
+				FollowersCount       int      `json:"followers_count"`
+				FriendsCount         int      `json:"friends_count"`
+				ListedCount          int      `json:"listed_count"`
+				Name                 string   `json:"name"`
+				Location             string   `json:"location"`
+				PinnedTweetIdsStr    []string `json:"pinned_tweet_ids_str"`
+				ProfileBannerURL     string   `json:"profile_banner_url"`
+				ProfileImageURLHTTPS string   `json:"profile_image_url_https"`
+				Protected            bool     `json:"protected"`
+				ScreenName           string   `json:"screen_name"`
+				StatusesCount        int      `json:"statuses_count"`
+				Verified             bool     `json:"verified"`
+			} `json:"legacy"`
+		} `json:"user"`
+	} `json:"data"`
+}
--- a/scraper/test_responses/dave_smith_anarchist_handbook.json
+++ b/scraper/test_responses/dave_smith_anarchist_handbook.json
--- a/scraper/test_responses/midriffs_anarchist_cookbook.json
+++ b/scraper/test_responses/midriffs_anarchist_cookbook.json
--- a/scraper/tweet.go
+++ b/scraper/tweet.go
@ -0,0 +1,94 @@
+package scraper
+
+import (
+	"time"
+	"fmt"
+	"strings"
+)
+
+
+// TweetID identifies a tweet.  It holds the ID in its string form (the API's
+// `id_str`); the empty string is used where no tweet is referenced.
+type TweetID string
+
+// Tweet is the parsed, application-level representation of a tweet, as
+// produced by ParseSingleTweet from an APITweet.
+type Tweet struct {
+	ID             TweetID
+	User           UserID    // author, taken from the API's user_id_str
+	Text           string    // full_text after ParseSingleTweet's clean-up
+	PostedAt       time.Time // parsed from the API's created_at
+	NumLikes       int
+	NumRetweets    int
+	NumReplies     int
+	NumQuoteTweets int
+	HasVideo       bool    // not implemented yet: ParseSingleTweet always sets false
+	InReplyTo      TweetID // empty when the tweet is not a reply
+
+	Urls        []string // expanded URLs of the links in the tweet
+	Images      []string // https URLs of attached photos
+	Mentions    []UserID // IDs of the mentioned users
+	Hashtags    []string // hashtag texts
+	QuotedTweet TweetID  // empty when the tweet quotes nothing
+}
+
+// String renders the tweet as a two-line, human-readable summary: identity,
+// quoted text, timestamp and counters on the first line, and the collected
+// URLs, images, mentions and hashtags on the second.
+func (t Tweet) String() string {
+	const layout = "ID %s, User %s: %q (%s). Likes: %d, Retweets: %d, QTs: %d, Replies: %d.\n" +
+		"Urls: %v   Images: %v   Mentions: %v   Hashtags: %v"
+
+	return fmt.Sprintf(layout,
+		t.ID, t.User, t.Text, t.PostedAt,
+		t.NumLikes, t.NumRetweets, t.NumQuoteTweets, t.NumReplies,
+		t.Urls, t.Images, t.Mentions, t.Hashtags,
+	)
+}
+
+// ParseSingleTweet converts a raw APITweet into a Tweet.
+//
+// The text is cleaned up on the way:
+//   - if the tweet has exactly one link (or exactly one media item) and its
+//     shortened URL is the very end of the text, that URL and the single
+//     space/newline before it are removed;
+//   - for replies, a leading "@<in_reply_to_screen_name>" and the separator
+//     after it are removed.
+//
+// An error is returned when `created_at` cannot be parsed or when a media
+// entity has a type other than "photo".  On error the returned Tweet is the
+// zero value.
+func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
+	postedAt, err := time.Parse(time.RubyDate, apiTweet.CreatedAt)
+	if err != nil {
+		return Tweet{}, fmt.Errorf("parsing created_at %q of tweet %q: %w", apiTweet.CreatedAt, apiTweet.ID, err)
+	}
+
+	text := apiTweet.FullText
+	if len(apiTweet.Entities.URLs) == 1 {
+		text = stripTrailingLink(text, apiTweet.Entities.URLs[0].URL)
+	}
+	if len(apiTweet.Entities.Media) == 1 {
+		text = stripTrailingLink(text, apiTweet.Entities.Media[0].URL)
+	}
+	text = stripReplyPrefix(text, apiTweet.InReplyToScreenName)
+
+	tweet := Tweet{
+		ID:             TweetID(apiTweet.ID),
+		User:           UserID(apiTweet.UserIDStr),
+		Text:           text,
+		PostedAt:       postedAt,
+		NumLikes:       apiTweet.FavoriteCount,
+		NumRetweets:    apiTweet.RetweetCount,
+		NumReplies:     apiTweet.ReplyCount,
+		NumQuoteTweets: apiTweet.QuoteCount,
+		InReplyTo:      TweetID(apiTweet.InReplyToStatusIDStr),
+		QuotedTweet:    TweetID(apiTweet.QuotedStatusIDStr),
+		HasVideo:       false, // TODO
+	}
+
+	for _, url := range apiTweet.Entities.URLs {
+		tweet.Urls = append(tweet.Urls, url.ExpandedURL)
+	}
+	for _, media := range apiTweet.Entities.Media {
+		if media.Type != "photo" {
+			// Report instead of panicking: one odd tweet must not take down
+			// the whole process.
+			return Tweet{}, fmt.Errorf("tweet %q: unknown media type: %q", apiTweet.ID, media.Type)
+		}
+		tweet.Images = append(tweet.Images, media.MediaURLHttps)
+	}
+	for _, hashtag := range apiTweet.Entities.Hashtags {
+		tweet.Hashtags = append(tweet.Hashtags, hashtag.Text)
+	}
+	for _, mention := range apiTweet.Entities.Mentions {
+		tweet.Mentions = append(tweet.Mentions, UserID(mention.UserID))
+	}
+
+	return tweet, nil
+}
+
+// stripTrailingLink removes `link` from the end of `text`, together with the
+// single space or newline that separates it from the preceding text.  The
+// text is returned unchanged when it does not end with `link`.
+func stripTrailingLink(text, link string) string {
+	if link == "" || !strings.HasSuffix(text, link) {
+		return text
+	}
+	text = strings.TrimSuffix(text, link)
+	if n := len(text); n > 0 && (text[n-1] == ' ' || text[n-1] == '\n') {
+		text = text[:n-1]
+	}
+	return text
+}
+
+// stripReplyPrefix removes a leading "@screenName" plus the single space or
+// newline after it.  A longer handle that merely starts with screenName
+// (e.g. "@bobby" for "bob") is left untouched.
+func stripReplyPrefix(text, screenName string) string {
+	if screenName == "" {
+		return text
+	}
+	mention := "@" + screenName
+	if !strings.HasPrefix(text, mention) {
+		return text
+	}
+	rest := text[len(mention):]
+	switch {
+	case rest == "":
+		return ""
+	case rest[0] == ' ' || rest[0] == '\n':
+		return rest[1:]
+	default:
+		return text
+	}
+}
--- a/scraper/tweet_test.go
+++ b/scraper/tweet_test.go
@ -0,0 +1,126 @@
+package scraper_test
+
+import (
+	// "fmt"
+	"encoding/json"
+	"io/ioutil"
+	"testing"
+
+	"offline_twitter/scraper"
+)
+
+// TestParseSingleTweet loads a recorded conversation response, checks the
+// number of tweets and users it contains, and verifies the cleaned-up text of
+// the conversation's main tweet.
+func TestParseSingleTweet(t *testing.T) {
+	data, err := ioutil.ReadFile("test_responses/dave_smith_anarchist_handbook.json")
+	if err != nil {
+		t.Fatalf("reading fixture: %v", err)
+	}
+	var tweet_resp scraper.TweetResponse
+	err = json.Unmarshal(data, &tweet_resp)
+	if err != nil {
+		// Nothing below is meaningful without a decoded response.
+		t.Fatalf("decoding fixture: %v", err)
+	}
+
+	tweets := tweet_resp.GlobalObjects.Tweets
+	users := tweet_resp.GlobalObjects.Users
+
+	if len(tweets) != 11 {
+		t.Errorf("Expected %d tweets, got %d instead", 11, len(tweets))
+	}
+
+	if len(users) != 11 {
+		t.Errorf("Expected %d users, got %d instead", 11, len(users))
+	}
+
+	dave_smith_tweet, ok := tweets["1395881699142160387"]
+	if !ok {
+		t.Fatalf("Didn't find the Dave Smith tweet.")
+	}
+
+	tweet, err := scraper.ParseSingleTweet(dave_smith_tweet)
+	if err != nil {
+		t.Fatalf("parsing tweet: %v", err)
+	}
+
+	expected_text := "The fact that @michaelmalice new book ‘The Anarchist Handbook’ is just absolutely destroying on the charts is the largest white pill I’ve swallowed in years."
+	actual_text := tweet.Text
+
+	if actual_text != expected_text {
+		t.Errorf("Expected: %q; got %q", expected_text, actual_text)
+	}
+}
+
+// TestParseSingleTweet2 loads a recorded conversation containing a tweet with
+// an image and a reply that quote-tweets another tweet, and checks the parsed
+// text, image count and quoted-tweet reference of each.
+func TestParseSingleTweet2(t *testing.T) {
+	data, err := ioutil.ReadFile("test_responses/midriffs_anarchist_cookbook.json")
+	if err != nil {
+		t.Fatalf("reading fixture: %v", err)
+	}
+	var tweet_resp scraper.TweetResponse
+	err = json.Unmarshal(data, &tweet_resp)
+	if err != nil {
+		// Nothing below is meaningful without a decoded response.
+		t.Fatalf("decoding fixture: %v", err)
+	}
+
+	tweets := tweet_resp.GlobalObjects.Tweets
+	users := tweet_resp.GlobalObjects.Users
+
+	if len(tweets) != 12 {
+		t.Errorf("Expected %d tweets, got %d instead", 12, len(tweets))
+	}
+
+	if len(users) != 11 {
+		t.Errorf("Expected %d users, got %d instead", 11, len(users))
+	}
+
+	t1, ok := tweets["1395882872729477131"]
+	if !ok {
+		t.Fatalf("Didn't find first tweet")
+	}
+	t2, ok := tweets["1396194922009661441"]
+	if !ok {
+		t.Fatalf("Didn't find second tweet")
+	}
+
+	tweet1, err := scraper.ParseSingleTweet(t1)
+	if err != nil {
+		t.Fatalf("parsing first tweet: %v", err)
+	}
+	tweet2, err := scraper.ParseSingleTweet(t2)
+	if err != nil {
+		t.Fatalf("parsing second tweet: %v", err)
+	}
+
+	expected_text := "this saddens me every time"
+	if tweet1.Text != expected_text {
+		t.Errorf("Expected: %q, got: %q", expected_text, tweet1.Text)
+	}
+	expected_text = "sometimes they're too dimwitted to even get the wrong title right"
+	if tweet2.Text != expected_text {
+		t.Errorf("Expected: %q, got: %q", expected_text, tweet2.Text)
+	}
+
+	if len(tweet1.Images) != 1 {
+		t.Errorf("Expected 1 images but got %d", len(tweet1.Images))
+	}
+	if tweet1.QuotedTweet != "" {
+		t.Errorf("Incorrectly believes it quote-tweets %q", tweet1.QuotedTweet)
+	}
+
+	if tweet2.QuotedTweet == "" {
+		t.Errorf("Should be a quoted tweet")
+	}
+
+	quoted_tweet_, ok := tweets[string(tweet2.QuotedTweet)]
+	if !ok {
+		// Parsing a zero-value tweet would only produce a confusing
+		// follow-up failure, so stop here.
+		t.Fatalf("Couldn't find the quoted tweet")
+	}
+
+	quoted_tweet, err := scraper.ParseSingleTweet(quoted_tweet_)
+	if err != nil {
+		t.Fatalf("parsing quoted tweet: %v", err)
+	}
+
+	expected_text = "I always liked \"The Anarchist's Cookbook.\""
+	if quoted_tweet.Text != expected_text {
+		t.Errorf("Expected %q, got %q", expected_text, quoted_tweet.Text)
+	}
+}
--- a/scraper/user.go
+++ b/scraper/user.go
@ -0,0 +1,3 @@
+package scraper
+
+// UserID identifies a user.  It holds the ID in its string form, as found in
+// the API's `user_id_str` / `id_str` fields.
+type UserID string