diff --git a/.build.yml b/.build.yml
index 8bd1a44..87bb7cf 100644
--- a/.build.yml
+++ b/.build.yml
@@ -43,5 +43,8 @@ tasks:
       cd twitter_offline_engine/scraper
       golangci-lint run
+      cd ../cmd
+      golangci-lint run
+
       duration=$SECONDS
       echo "Task completed in $(($duration / 60))m$(($duration % 60))s."
diff --git a/cmd/fetch_tweet_to_stdout.go b/cmd/fetch_tweet_to_stdout.go
new file mode 100644
index 0000000..2e4a047
--- /dev/null
+++ b/cmd/fetch_tweet_to_stdout.go
@@ -0,0 +1,57 @@
+package main
+
+import (
+    "os"
+    "fmt"
+    "offline_twitter/scraper"
+    // "time"
+    "log"
+    "strings"
+)
+
+const INCLUDE_REPLIES = true;
+
+// input: e.g., "https://twitter.com/michaelmalice/status/1395882872729477131"
+func parse_tweet(url string) (string, error) {
+    parts := strings.Split(url, "/")
+    if len(parts) != 6 {
+        return "", fmt.Errorf("Tweet format isn't right (%d)", len(parts))
+    }
+    if parts[0] != "https:" || parts[1] != "" || parts[2] != "twitter.com" || parts[4] != "status" {
+        return "", fmt.Errorf("Tweet format isn't right")
+    }
+    return parts[5], nil
+}
+
+func main() {
+    if len(os.Args) < 2 {
+        log.Fatal("Must provide tweet! Exiting...")
+    }
+
+    tweet_id, err := parse_tweet(os.Args[1])
+    if err != nil {
+        log.Fatal(err.Error())
+    }
+
+    if INCLUDE_REPLIES {
+        tweets, retweets, users, err := scraper.GetTweetFull(tweet_id)
+        if err != nil {
+            log.Fatal(err.Error())
+        }
+        for _, t := range tweets {
+            fmt.Printf("%v\n", t)
+        }
+        for _, t := range retweets {
+            fmt.Printf("%v\n", t)
+        }
+        for _, u := range users {
+            fmt.Printf("%v\n", u)
+        }
+    } else {
+        tweet, err := scraper.GetTweet(tweet_id)
+        if err != nil {
+            log.Fatal(err.Error())
+        }
+        fmt.Printf("%v\n", tweet)
+    }
+}
diff --git a/cmd/fetch_tweet_to_stdout/main.go b/cmd/fetch_tweet_to_stdout/main.go
new file mode 100644
index 0000000..2e4a047
--- /dev/null
+++ b/cmd/fetch_tweet_to_stdout/main.go
@@ -0,0 +1,57 @@
+package main
+
+import (
+    "os"
+    "fmt"
+    "offline_twitter/scraper"
+    // "time"
+    "log"
+    "strings"
+)
+
+const INCLUDE_REPLIES = true;
+
+// input: e.g., "https://twitter.com/michaelmalice/status/1395882872729477131"
+func parse_tweet(url string) (string, error) {
+    parts := strings.Split(url, "/")
+    if len(parts) != 6 {
+        return "", fmt.Errorf("Tweet format isn't right (%d)", len(parts))
+    }
+    if parts[0] != "https:" || parts[1] != "" || parts[2] != "twitter.com" || parts[4] != "status" {
+        return "", fmt.Errorf("Tweet format isn't right")
+    }
+    return parts[5], nil
+}
+
+func main() {
+    if len(os.Args) < 2 {
+        log.Fatal("Must provide tweet! Exiting...")
+    }
+
+    tweet_id, err := parse_tweet(os.Args[1])
+    if err != nil {
+        log.Fatal(err.Error())
+    }
+
+    if INCLUDE_REPLIES {
+        tweets, retweets, users, err := scraper.GetTweetFull(tweet_id)
+        if err != nil {
+            log.Fatal(err.Error())
+        }
+        for _, t := range tweets {
+            fmt.Printf("%v\n", t)
+        }
+        for _, t := range retweets {
+            fmt.Printf("%v\n", t)
+        }
+        for _, u := range users {
+            fmt.Printf("%v\n", u)
+        }
+    } else {
+        tweet, err := scraper.GetTweet(tweet_id)
+        if err != nil {
+            log.Fatal(err.Error())
+        }
+        fmt.Printf("%v\n", tweet)
+    }
+}
diff --git a/cmd/fetch_user_feed_to_stdout.go b/cmd/fetch_user_feed_to_stdout.go
new file mode 100644
index 0000000..c96f769
--- /dev/null
+++ b/cmd/fetch_user_feed_to_stdout.go
@@ -0,0 +1,96 @@
+package main
+
+import (
+    "os"
+    "fmt"
+    "offline_twitter/scraper"
+    "log"
+    "sort"
+)
+
+func main() {
+    if len(os.Args) < 2 {
+        log.Fatal("Must provide a user handle! Exiting...")
Exiting...") + } + handle := scraper.UserHandle(os.Args[1]) + + user, err := scraper.GetUser(handle) + if err != nil { + log.Fatal("Error getting user profile: " + err.Error()) + } + + tweets, retweets, users, err := scraper.GetFeedFull(user.ID, 1) + if err != nil { + log.Fatal("Error getting user feed: " + err.Error()) + } + + display_feed(user, tweets, retweets, users) + + fmt.Printf("Got a total of %d tweets, %d retweets, from %d users\n", len(tweets), len(retweets), len(users)) +} + +func display_feed(user scraper.User, tweets []scraper.Tweet, retweets []scraper.Retweet, users []scraper.User) { + sort.Slice(tweets, func(i, j int) bool { return !tweets[i].PostedAt.Before(tweets[j].PostedAt) }) + tweet_map := make(map[scraper.TweetID]scraper.Tweet) + for _, t := range tweets { + tweet_map[t.ID] = t + } + + sort.Slice(retweets, func(i, j int) bool { return !retweets[i].RetweetedAt.Before(retweets[j].RetweetedAt) }) + users_dict := make(map[scraper.UserID]scraper.User) + for _, u := range users { + users_dict[u.ID] = u + } + + i := 0 + j := 0 + for { + if i < len(tweets) && j < len(retweets) { + if !tweets[i].PostedAt.Before(retweets[j].RetweetedAt) { + tweet := tweets[i] + if tweet.User != user.ID { + i += 1 + continue + } + + user, ok := users_dict[tweet.User] + if !ok { + log.Fatalf("User not found: %q", tweet.User) + } + + print_tweet(tweets[i], user) + i += 1 + } else { + retweet := retweets[j] + if retweet.RetweetedBy != user.ID { + j += 1 + continue + } + tweet, ok := tweet_map[retweet.TweetID] + if !ok { + log.Fatalf("Tweet not found: %q", retweet.TweetID) + } + original_poster, ok := users_dict[tweet.User] + if !ok { + log.Fatalf("User not found: %q", tweet.User) + } + retweeter, ok := users_dict[retweet.RetweetedBy] + if !ok { + log.Fatalf("User not found: %q", retweet.RetweetedBy) + } + print_retweet(retweet, tweet, original_poster, retweeter) + j += 1 + } + } else { + break + } + } +} + +func print_tweet(tweet scraper.Tweet, user scraper.User) { + fmt.Printf("%s => %s\n Replies: %d Retweets: %d Likes: %d\n", user.DisplayName, tweet.Text, tweet.NumReplies, tweet.NumRetweets, tweet.NumLikes) +} + +func print_retweet(retweet scraper.Retweet, original_tweet scraper.Tweet, original_poster scraper.User, retweeter scraper.User) { + fmt.Printf("%s [retweet] %s => %s\n Replies: %d Retweets: %d Likes: %d\n", retweeter.DisplayName, original_poster.DisplayName, original_tweet.Text, original_tweet.NumReplies, original_tweet.NumRetweets, original_tweet.NumLikes) +} diff --git a/cmd/fetch_user_feed_to_stdout/main.go b/cmd/fetch_user_feed_to_stdout/main.go new file mode 100644 index 0000000..2dfb4fb --- /dev/null +++ b/cmd/fetch_user_feed_to_stdout/main.go @@ -0,0 +1,128 @@ +package main + +import ( + "os" + "fmt" + "offline_twitter/scraper" + "log" + "sort" +) + +func main() { + if len(os.Args) < 2 { + log.Fatal("Must provide a user handle! 
Exiting...") + } + handle := scraper.UserHandle(os.Args[1]) + + user, err := scraper.GetUser(handle) + if err != nil { + log.Fatal("Error getting user profile: " + err.Error()) + } + + tweets, retweets, users, err := scraper.GetFeedFull(user.ID, 1) + if err != nil { + log.Fatal("Error getting user feed: " + err.Error()) + } + + display_feed(user, tweets, retweets, users) + + fmt.Printf("Got a total of %d tweets, %d retweets, from %d users\n", len(tweets), len(retweets), len(users)) +} + +func display_feed(user scraper.User, tweets []scraper.Tweet, retweets []scraper.Retweet, users []scraper.User) { + sort.Slice(tweets, func(i, j int) bool { return !tweets[i].PostedAt.Before(tweets[j].PostedAt) }) + tweet_map := make(map[scraper.TweetID]scraper.Tweet) + for _, t := range tweets { + tweet_map[t.ID] = t + } + + sort.Slice(retweets, func(i, j int) bool { return !retweets[i].RetweetedAt.Before(retweets[j].RetweetedAt) }) + users_dict := make(map[scraper.UserID]scraper.User) + for _, u := range users { + users_dict[u.ID] = u + } + + i := 0 + j := 0 + for i < len(tweets) && j < len(retweets) { + if !tweets[i].PostedAt.Before(retweets[j].RetweetedAt) { + tweet := tweets[i] + if tweet.User != user.ID { + i += 1 + continue + } + + user, ok := users_dict[tweet.User] + if !ok { + log.Fatalf("User not found: %q", tweet.User) + } + + print_tweet(tweets[i], user) + i += 1 + } else { + retweet := retweets[j] + if retweet.RetweetedBy != user.ID { + j += 1 + continue + } + tweet, ok := tweet_map[retweet.TweetID] + if !ok { + log.Fatalf("Tweet not found: %q", retweet.TweetID) + } + original_poster, ok := users_dict[tweet.User] + if !ok { + log.Fatalf("User not found: %q", tweet.User) + } + retweeter, ok := users_dict[retweet.RetweetedBy] + if !ok { + log.Fatalf("User not found: %q", retweet.RetweetedBy) + } + print_retweet(retweet, tweet, original_poster, retweeter) + j += 1 + } + } + for i < len(tweets) { + tweet := tweets[i] + if tweet.User != user.ID { + i += 1 + continue + } + + user, ok := users_dict[tweet.User] + if !ok { + log.Fatalf("User not found: %q", tweet.User) + } + + print_tweet(tweets[i], user) + i += 1 + } + for j < len(retweets) { + retweet := retweets[j] + if retweet.RetweetedBy != user.ID { + j += 1 + continue + } + tweet, ok := tweet_map[retweet.TweetID] + if !ok { + log.Fatalf("Tweet not found: %q", retweet.TweetID) + } + original_poster, ok := users_dict[tweet.User] + if !ok { + log.Fatalf("User not found: %q", tweet.User) + } + retweeter, ok := users_dict[retweet.RetweetedBy] + if !ok { + log.Fatalf("User not found: %q", retweet.RetweetedBy) + } + print_retweet(retweet, tweet, original_poster, retweeter) + j += 1 + } +} + +func print_tweet(tweet scraper.Tweet, user scraper.User) { + fmt.Printf("%s => %s\n Replies: %d Retweets: %d Likes: %d\n", user.DisplayName, tweet.Text, tweet.NumReplies, tweet.NumRetweets, tweet.NumLikes) +} + +func print_retweet(retweet scraper.Retweet, original_tweet scraper.Tweet, original_poster scraper.User, retweeter scraper.User) { + fmt.Printf("%s [retweet] %s => %s\n Replies: %d Retweets: %d Likes: %d\n", retweeter.DisplayName, original_poster.DisplayName, original_tweet.Text, original_tweet.NumReplies, original_tweet.NumRetweets, original_tweet.NumLikes) +} diff --git a/cmd/fetch_user_profile_to_stdout.go b/cmd/fetch_user_profile_to_stdout.go new file mode 100644 index 0000000..aa0f8f7 --- /dev/null +++ b/cmd/fetch_user_profile_to_stdout.go @@ -0,0 +1,25 @@ +package main + +import ( + "os" + "fmt" + "offline_twitter/scraper" + "log" +) + +const 
+
+func main() {
+    if len(os.Args) < 2 {
+        log.Fatal("Must provide tweet!")
+    }
+
+    user_handle := os.Args[1]
+
+    user, err := scraper.GetUser(user_handle)
+    if err != nil {
+        log.Fatal(err.Error())
+    }
+
+    fmt.Printf("%v\n", user)
+}
diff --git a/cmd/fetch_user_profile_to_stdout/main.go b/cmd/fetch_user_profile_to_stdout/main.go
new file mode 100644
index 0000000..5342836
--- /dev/null
+++ b/cmd/fetch_user_profile_to_stdout/main.go
@@ -0,0 +1,25 @@
+package main
+
+import (
+    "os"
+    "fmt"
+    "offline_twitter/scraper"
+    "log"
+)
+
+// const INCLUDE_REPLIES = true;
+
+func main() {
+    if len(os.Args) < 2 {
+        log.Fatal("Must provide tweet!")
+    }
+
+    user_handle := scraper.UserHandle(os.Args[1])
+
+    user, err := scraper.GetUser(user_handle)
+    if err != nil {
+        log.Fatal(err.Error())
+    }
+
+    fmt.Printf("%v\n", user)
+}
diff --git a/scraper/api_request_utils.go b/scraper/api_request_utils.go
new file mode 100644
index 0000000..c4ff0ce
--- /dev/null
+++ b/scraper/api_request_utils.go
@@ -0,0 +1,220 @@
+package scraper
+
+import (
+    "encoding/json"
+    "fmt"
+    "io/ioutil"
+    "net/http"
+    "time"
+)
+
+const API_CONVERSATION_BASE_PATH = "https://twitter.com/i/api/2/timeline/conversation/"
+const API_USER_TIMELINE_BASE_PATH = "https://api.twitter.com/2/timeline/profile/"
+
+type API struct{}
+
+func (api API) GetFeedFor(user_id UserID, cursor string) (TweetResponse, error) {
+    client := &http.Client{Timeout: 10 * time.Second}
+    req, err := http.NewRequest("GET", API_USER_TIMELINE_BASE_PATH + string(user_id) + ".json", nil)
+    if err != nil {
+        return TweetResponse{}, err
+    }
+
+    err = ApiRequestAddTokens(req)
+    if err != nil {
+        return TweetResponse{}, err
+    }
+
+    ApiRequestAddAllParams(req)
+
+    if cursor != "" {
+        UpdateQueryCursor(req, cursor, false)
+    }
+
+    resp, err := client.Do(req)
+    if err != nil {
+        return TweetResponse{}, err
+    }
+    defer resp.Body.Close()
+
+    if resp.StatusCode != http.StatusOK {
+        content, _ := ioutil.ReadAll(resp.Body)
+        return TweetResponse{}, fmt.Errorf("HTTP %s: %s", resp.Status, content)
+    }
+
+    body, err := ioutil.ReadAll(resp.Body)
+    if err != nil {
+        return TweetResponse{}, err
+    }
+
+    var response TweetResponse
+    err = json.Unmarshal(body, &response)
+    return response, err
+}
+
+// Resend the request to get more tweets if necessary
+func (api API) GetMoreTweets(user_id UserID, response *TweetResponse, max_tweets int) error {
+    last_response := response
+    for last_response.GetCursor() != "" && len(response.GlobalObjects.Tweets) < max_tweets {
+        fresh_response, err := api.GetFeedFor(user_id, last_response.GetCursor())
+        if err != nil {
+            return err
+        }
+
+        last_response = &fresh_response
+
+        // Copy over the tweets and the users
+        for id, tweet := range last_response.GlobalObjects.Tweets {
+            response.GlobalObjects.Tweets[id] = tweet
+        }
+        for id, user := range last_response.GlobalObjects.Users {
+            response.GlobalObjects.Users[id] = user
+        }
+    }
+    return nil
+}
+
+
+func (api API) GetTweet(id string, cursor string) (TweetResponse, error) {
+    client := &http.Client{Timeout: 10 * time.Second}
+    req, err := http.NewRequest("GET", API_CONVERSATION_BASE_PATH + id + ".json", nil)
+    if err != nil {
+        return TweetResponse{}, err
+    }
+
+    err = ApiRequestAddTokens(req)
+    if err != nil {
+        return TweetResponse{}, err
+    }
+
+    ApiRequestAddAllParams(req)
+    if cursor != "" {
+        UpdateQueryCursor(req, cursor, true)
+    }
+
+    resp, err := client.Do(req)
+    if err != nil {
+        return TweetResponse{}, err
+    }
+    defer resp.Body.Close()
+
+    if !(resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusForbidden) {
+        content, _ := ioutil.ReadAll(resp.Body)
+        return TweetResponse{}, fmt.Errorf("HTTP %d %s: %s", resp.StatusCode, resp.Status, content)
+    }
+
+    body, err := ioutil.ReadAll(resp.Body)
+    if err != nil {
+        return TweetResponse{}, err
+    }
+
+    var response TweetResponse
+    err = json.Unmarshal(body, &response)
+    return response, err
+}
+
+// Resend the request to get more replies if necessary
+func (api API) GetMoreReplies(tweet_id string, response *TweetResponse, max_replies int) error {
+    last_response := response
+    for last_response.GetCursor() != "" && len(response.GlobalObjects.Tweets) < max_replies {
+        fresh_response, err := api.GetTweet(tweet_id, last_response.GetCursor())
+        if err != nil {
+            return err
+        }
+
+        last_response = &fresh_response
+
+        // Copy over the tweets and the users
+        for id, tweet := range last_response.GlobalObjects.Tweets {
+            response.GlobalObjects.Tweets[id] = tweet
+        }
+        for id, user := range last_response.GlobalObjects.Users {
+            response.GlobalObjects.Users[id] = user
+        }
+    }
+    return nil
+}
+
+func UpdateQueryCursor(req *http.Request, new_cursor string, is_tweet bool) {
+    query := req.URL.Query()
+    query.Add("cursor", new_cursor)
+    if is_tweet {
+        query.Add("referrer", "tweet")
+    }
+    req.URL.RawQuery = query.Encode()
+}
+
+
+func (api API) GetUser(handle UserHandle) (APIUser, error) {
+    client := &http.Client{Timeout: 10 * time.Second}
+    req, err := http.NewRequest("GET", "https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22" + string(handle) + "%22%2C%22withHighlightedLabel%22%3Atrue%7D", nil)
+    if err != nil {
+        return APIUser{}, err
+    }
+    err = ApiRequestAddTokens(req)
+    if err != nil {
+        return APIUser{}, err
+    }
+
+    resp, err := client.Do(req)
+    if err != nil {
+        return APIUser{}, err
+    }
+    defer resp.Body.Close()
+
+    if !(resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusForbidden) {
+        content, _ := ioutil.ReadAll(resp.Body)
+        return APIUser{}, fmt.Errorf("response status %s: %s", resp.Status, content)
+    }
+
+    body, err := ioutil.ReadAll(resp.Body)
+    if err != nil {
+        return APIUser{}, err
+    }
+
+    var response UserResponse
+    err = json.Unmarshal(body, &response)
+    return response.ConvertToAPIUser(), err
+}
+
+// Add Bearer token and guest token
+func ApiRequestAddTokens(req *http.Request) error {
+    req.Header.Set("Authorization", "Bearer " + BEARER_TOKEN)
+
+    guestToken, err := GetGuestToken()
+    if err != nil {
+        return err
+    }
+    req.Header.Set("X-Guest-Token", guestToken)
+    return nil
+}
+
+// Add the query params to get all data
+func ApiRequestAddAllParams(req *http.Request) {
+    query := req.URL.Query()
+    query.Add("include_profile_interstitial_type", "1")
+    query.Add("include_blocking", "1")
+    query.Add("include_blocked_by", "1")
+    query.Add("include_followed_by", "1")
+    query.Add("include_want_retweets", "1")
+    query.Add("include_mute_edge", "1")
+    query.Add("include_can_dm", "1")
+    query.Add("include_can_media_tag", "1")
+    query.Add("skip_status", "1")
+    query.Add("cards_platform", "Web-12")
+    query.Add("include_cards", "1")
+    query.Add("include_ext_alt_text", "true")
+    query.Add("include_quote_count", "true")
+    query.Add("include_reply_count", "1")
+    query.Add("tweet_mode", "extended")
+    query.Add("include_entities", "true")
+    query.Add("include_user_entities", "true")
+    query.Add("include_ext_media_color", "true")
+    query.Add("include_ext_media_availability", "true")
+    query.Add("send_error_codes", "true")
query.Add("simple_quoted_tweet", "true") + query.Add("include_tweet_replies", "true") + query.Add("ext", "mediaStats,highlightedLabel") + query.Add("count", "20") + req.URL.RawQuery = query.Encode() +} diff --git a/scraper/tweet.go b/scraper/tweet.go index 73e9916..fc6ffac 100644 --- a/scraper/tweet.go +++ b/scraper/tweet.go @@ -5,6 +5,7 @@ import ( "fmt" ) +const DEFAULT_MAX_REPLIES_EAGER_LOAD = 50 type TweetID string @@ -75,6 +76,43 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) { } +// Return a single tweet, nothing else +func GetTweet(id string) (Tweet, error) { + api := API{} + tweet_response, err := api.GetTweet(id, "") + if err != nil { + return Tweet{}, err + } + + single_tweet, ok := tweet_response.GlobalObjects.Tweets[id] + + if !ok { + return Tweet{}, fmt.Errorf("Didn't get the tweet!\n%v", tweet_response) + } + + return ParseSingleTweet(single_tweet) +} + + +// Return a list of tweets, including the original and the rest of its thread, +// along with a list of associated users +func GetTweetFull(id string) (tweets []Tweet, retweets []Retweet, users []User, err error) { + api := API{} + tweet_response, err := api.GetTweet(id, "") + if err != nil { + return + } + if len(tweet_response.GlobalObjects.Tweets) < DEFAULT_MAX_REPLIES_EAGER_LOAD && + tweet_response.GetCursor() != "" { + err = api.GetMoreReplies(id, &tweet_response, DEFAULT_MAX_REPLIES_EAGER_LOAD) + if err != nil { + return + } + } + + return ParseTweetResponse(tweet_response) +} + func ParseTweetResponse(resp TweetResponse) (tweets []Tweet, retweets []Retweet, users []User, err error) { var new_tweet Tweet var new_retweet Retweet diff --git a/scraper/user.go b/scraper/user.go index d5c6c71..1867d41 100644 --- a/scraper/user.go +++ b/scraper/user.go @@ -8,6 +8,14 @@ import ( type UserID string type UserHandle string +func UIDArrayToStrArray(uids []UserID) []string { + ret := []string{} + for _, uid := range uids { + ret = append(ret, string(uid)) + } + return ret +} + type User struct { ID UserID DisplayName string @@ -54,3 +62,13 @@ func ParseSingleUser(apiUser APIUser) (ret User, err error) { } return } + +// Calls API#GetUser and returns the parsed result +func GetUser(handle UserHandle) (User, error) { + api := API{} + apiUser, err := api.GetUser(handle) + if err != nil { + return User{}, err + } + return ParseSingleUser(apiUser) +} diff --git a/scraper/user_feed.go b/scraper/user_feed.go new file mode 100644 index 0000000..d323fcc --- /dev/null +++ b/scraper/user_feed.go @@ -0,0 +1,22 @@ +package scraper + + +// Return a list of tweets, including the original and the rest of its thread, +// along with a list of associated users +func GetFeedFull(user_id UserID, max_tweets int) (tweets []Tweet, retweets []Retweet, users []User, err error) { + api := API{} + tweet_response, err := api.GetFeedFor(user_id, "") + if err != nil { + return + } + + if len(tweet_response.GlobalObjects.Tweets) < max_tweets && + tweet_response.GetCursor() != "" { + err = api.GetMoreTweets(user_id, &tweet_response, max_tweets) + if err != nil { + return + } + } + + return ParseTweetResponse(tweet_response) +}