Add actual scraping

Alessio 2021-06-16 19:31:27 -07:00
parent f28d600024
commit 28b11548bf
11 changed files with 689 additions and 0 deletions

View File

@@ -43,5 +43,8 @@ tasks:
    cd twitter_offline_engine/scraper
    golangci-lint run
    cd ../cmd
    golangci-lint run
    duration=$SECONDS
    echo "Task completed in $(($duration / 60))m$(($duration % 60))s."

View File

@@ -0,0 +1,57 @@
package main

import (
	"fmt"
	"log"
	"os"
	"strings"
	// "time"

	"offline_twitter/scraper"
)

const INCLUDE_REPLIES = true

// parse_tweet extracts the tweet ID from a tweet URL.
// input: e.g., "https://twitter.com/michaelmalice/status/1395882872729477131"
func parse_tweet(url string) (string, error) {
	parts := strings.Split(url, "/")
	if len(parts) != 6 {
		return "", fmt.Errorf("tweet URL format isn't right (%d path segments)", len(parts))
	}
	if parts[0] != "https:" || parts[1] != "" || parts[2] != "twitter.com" || parts[4] != "status" {
		return "", fmt.Errorf("tweet URL format isn't right")
	}
	return parts[5], nil
}

func main() {
	if len(os.Args) < 2 {
		log.Fatal("Must provide a tweet URL! Exiting...")
	}
	tweet_id, err := parse_tweet(os.Args[1])
	if err != nil {
		log.Fatal(err.Error())
	}

	if INCLUDE_REPLIES {
		tweets, retweets, users, err := scraper.GetTweetFull(tweet_id)
		if err != nil {
			log.Fatal(err.Error())
		}
		for _, t := range tweets {
			fmt.Printf("%v\n", t)
		}
		for _, r := range retweets {
			fmt.Printf("%v\n", r)
		}
		for _, u := range users {
			fmt.Printf("%v\n", u)
		}
	} else {
		tweet, err := scraper.GetTweet(tweet_id)
		if err != nil {
			log.Fatal(err.Error())
		}
		fmt.Printf("%v\n", tweet)
	}
}
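
A minimal usage sketch for parse_tweet (hypothetical, not part of this commit), exercising the example URL from the comment above:

// Hypothetical sketch: checks parse_tweet against the documented example URL.
func example_parse_tweet() {
	id, err := parse_tweet("https://twitter.com/michaelmalice/status/1395882872729477131")
	if err != nil {
		log.Fatal(err.Error())
	}
	fmt.Println(id) // prints "1395882872729477131"
}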

View File

@ -0,0 +1,57 @@
package main

import (
	"fmt"
	"log"
	"os"
	"strings"
	// "time"

	"offline_twitter/scraper"
)

const INCLUDE_REPLIES = true

// parse_tweet extracts the tweet ID from a tweet URL.
// input: e.g., "https://twitter.com/michaelmalice/status/1395882872729477131"
func parse_tweet(url string) (string, error) {
	parts := strings.Split(url, "/")
	if len(parts) != 6 {
		return "", fmt.Errorf("tweet URL format isn't right (%d path segments)", len(parts))
	}
	if parts[0] != "https:" || parts[1] != "" || parts[2] != "twitter.com" || parts[4] != "status" {
		return "", fmt.Errorf("tweet URL format isn't right")
	}
	return parts[5], nil
}

func main() {
	if len(os.Args) < 2 {
		log.Fatal("Must provide a tweet URL! Exiting...")
	}
	tweet_id, err := parse_tweet(os.Args[1])
	if err != nil {
		log.Fatal(err.Error())
	}

	if INCLUDE_REPLIES {
		tweets, retweets, users, err := scraper.GetTweetFull(tweet_id)
		if err != nil {
			log.Fatal(err.Error())
		}
		for _, t := range tweets {
			fmt.Printf("%v\n", t)
		}
		for _, r := range retweets {
			fmt.Printf("%v\n", r)
		}
		for _, u := range users {
			fmt.Printf("%v\n", u)
		}
	} else {
		tweet, err := scraper.GetTweet(tweet_id)
		if err != nil {
			log.Fatal(err.Error())
		}
		fmt.Printf("%v\n", tweet)
	}
}

View File

@@ -0,0 +1,96 @@
package main

import (
	"fmt"
	"log"
	"os"
	"sort"

	"offline_twitter/scraper"
)

func main() {
	if len(os.Args) < 2 {
		log.Fatal("Must provide a user handle! Exiting...")
	}
	handle := scraper.UserHandle(os.Args[1])
	user, err := scraper.GetUser(handle)
	if err != nil {
		log.Fatal("Error getting user profile: " + err.Error())
	}

	tweets, retweets, users, err := scraper.GetFeedFull(user.ID, 1)
	if err != nil {
		log.Fatal("Error getting user feed: " + err.Error())
	}
	display_feed(user, tweets, retweets, users)
	fmt.Printf("Got a total of %d tweets, %d retweets, from %d users\n", len(tweets), len(retweets), len(users))
}

// display_feed interleaves the user's tweets and retweets in reverse
// chronological order (a two-pointer merge of two descending-sorted lists).
func display_feed(user scraper.User, tweets []scraper.Tweet, retweets []scraper.Retweet, users []scraper.User) {
	// Sort tweets, newest first
	sort.Slice(tweets, func(i, j int) bool { return !tweets[i].PostedAt.Before(tweets[j].PostedAt) })
	tweet_map := make(map[scraper.TweetID]scraper.Tweet)
	for _, t := range tweets {
		tweet_map[t.ID] = t
	}

	// Sort retweets, newest first
	sort.Slice(retweets, func(i, j int) bool { return !retweets[i].RetweetedAt.Before(retweets[j].RetweetedAt) })

	users_dict := make(map[scraper.UserID]scraper.User)
	for _, u := range users {
		users_dict[u.ID] = u
	}

	i := 0
	j := 0
	// Note: this loop ends as soon as either list is exhausted; any items
	// remaining in the other list are not printed.
	for i < len(tweets) && j < len(retweets) {
		if !tweets[i].PostedAt.Before(retweets[j].RetweetedAt) {
			tweet := tweets[i]
			if tweet.User != user.ID {
				i += 1
				continue
			}
			poster, ok := users_dict[tweet.User]
			if !ok {
				log.Fatalf("User not found: %q", tweet.User)
			}
			print_tweet(tweet, poster)
			i += 1
		} else {
			retweet := retweets[j]
			if retweet.RetweetedBy != user.ID {
				j += 1
				continue
			}
			tweet, ok := tweet_map[retweet.TweetID]
			if !ok {
				log.Fatalf("Tweet not found: %q", retweet.TweetID)
			}
			original_poster, ok := users_dict[tweet.User]
			if !ok {
				log.Fatalf("User not found: %q", tweet.User)
			}
			retweeter, ok := users_dict[retweet.RetweetedBy]
			if !ok {
				log.Fatalf("User not found: %q", retweet.RetweetedBy)
			}
			print_retweet(retweet, tweet, original_poster, retweeter)
			j += 1
		}
	}
}

func print_tweet(tweet scraper.Tweet, user scraper.User) {
	fmt.Printf("%s => %s\n Replies: %d Retweets: %d Likes: %d\n", user.DisplayName, tweet.Text, tweet.NumReplies, tweet.NumRetweets, tweet.NumLikes)
}

func print_retweet(retweet scraper.Retweet, original_tweet scraper.Tweet, original_poster scraper.User, retweeter scraper.User) {
	fmt.Printf("%s [retweet] %s => %s\n Replies: %d Retweets: %d Likes: %d\n", retweeter.DisplayName, original_poster.DisplayName, original_tweet.Text, original_tweet.NumReplies, original_tweet.NumRetweets, original_tweet.NumLikes)
}

View File

@@ -0,0 +1,128 @@
package main

import (
	"fmt"
	"log"
	"os"
	"sort"

	"offline_twitter/scraper"
)

func main() {
	if len(os.Args) < 2 {
		log.Fatal("Must provide a user handle! Exiting...")
	}
	handle := scraper.UserHandle(os.Args[1])
	user, err := scraper.GetUser(handle)
	if err != nil {
		log.Fatal("Error getting user profile: " + err.Error())
	}

	tweets, retweets, users, err := scraper.GetFeedFull(user.ID, 1)
	if err != nil {
		log.Fatal("Error getting user feed: " + err.Error())
	}
	display_feed(user, tweets, retweets, users)
	fmt.Printf("Got a total of %d tweets, %d retweets, from %d users\n", len(tweets), len(retweets), len(users))
}

// display_feed interleaves the user's tweets and retweets in reverse
// chronological order (a two-pointer merge of two descending-sorted lists).
func display_feed(user scraper.User, tweets []scraper.Tweet, retweets []scraper.Retweet, users []scraper.User) {
	// Sort tweets, newest first
	sort.Slice(tweets, func(i, j int) bool { return !tweets[i].PostedAt.Before(tweets[j].PostedAt) })
	tweet_map := make(map[scraper.TweetID]scraper.Tweet)
	for _, t := range tweets {
		tweet_map[t.ID] = t
	}

	// Sort retweets, newest first
	sort.Slice(retweets, func(i, j int) bool { return !retweets[i].RetweetedAt.Before(retweets[j].RetweetedAt) })

	users_dict := make(map[scraper.UserID]scraper.User)
	for _, u := range users {
		users_dict[u.ID] = u
	}

	i := 0
	j := 0
	// Merge phase: print whichever of the next tweet / next retweet is newer
	for i < len(tweets) && j < len(retweets) {
		if !tweets[i].PostedAt.Before(retweets[j].RetweetedAt) {
			tweet := tweets[i]
			if tweet.User != user.ID {
				i += 1
				continue
			}
			poster, ok := users_dict[tweet.User]
			if !ok {
				log.Fatalf("User not found: %q", tweet.User)
			}
			print_tweet(tweet, poster)
			i += 1
		} else {
			retweet := retweets[j]
			if retweet.RetweetedBy != user.ID {
				j += 1
				continue
			}
			tweet, ok := tweet_map[retweet.TweetID]
			if !ok {
				log.Fatalf("Tweet not found: %q", retweet.TweetID)
			}
			original_poster, ok := users_dict[tweet.User]
			if !ok {
				log.Fatalf("User not found: %q", tweet.User)
			}
			retweeter, ok := users_dict[retweet.RetweetedBy]
			if !ok {
				log.Fatalf("User not found: %q", retweet.RetweetedBy)
			}
			print_retweet(retweet, tweet, original_poster, retweeter)
			j += 1
		}
	}

	// One list is exhausted; drain any remaining tweets...
	for i < len(tweets) {
		tweet := tweets[i]
		if tweet.User != user.ID {
			i += 1
			continue
		}
		poster, ok := users_dict[tweet.User]
		if !ok {
			log.Fatalf("User not found: %q", tweet.User)
		}
		print_tweet(tweet, poster)
		i += 1
	}

	// ...and any remaining retweets
	for j < len(retweets) {
		retweet := retweets[j]
		if retweet.RetweetedBy != user.ID {
			j += 1
			continue
		}
		tweet, ok := tweet_map[retweet.TweetID]
		if !ok {
			log.Fatalf("Tweet not found: %q", retweet.TweetID)
		}
		original_poster, ok := users_dict[tweet.User]
		if !ok {
			log.Fatalf("User not found: %q", tweet.User)
		}
		retweeter, ok := users_dict[retweet.RetweetedBy]
		if !ok {
			log.Fatalf("User not found: %q", retweet.RetweetedBy)
		}
		print_retweet(retweet, tweet, original_poster, retweeter)
		j += 1
	}
}

func print_tweet(tweet scraper.Tweet, user scraper.User) {
	fmt.Printf("%s => %s\n Replies: %d Retweets: %d Likes: %d\n", user.DisplayName, tweet.Text, tweet.NumReplies, tweet.NumRetweets, tweet.NumLikes)
}

func print_retweet(retweet scraper.Retweet, original_tweet scraper.Tweet, original_poster scraper.User, retweeter scraper.User) {
	fmt.Printf("%s [retweet] %s => %s\n Replies: %d Retweets: %d Likes: %d\n", retweeter.DisplayName, original_poster.DisplayName, original_tweet.Text, original_tweet.NumReplies, original_tweet.NumRetweets, original_tweet.NumLikes)
}
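
The interleaving above is the classic two-pointer merge of two descending-sorted lists, followed by drain loops for whichever list has leftovers. A minimal standalone sketch of the pattern (hypothetical, with plain ints standing in for timestamps):

// Hypothetical sketch of the merge pattern used in display_feed.
// a and b are each sorted descending; the merged output is also descending.
func merge_desc(a []int, b []int) []int {
	out := make([]int, 0, len(a)+len(b))
	i, j := 0, 0
	for i < len(a) && j < len(b) {
		if a[i] >= b[j] {
			out = append(out, a[i])
			i += 1
		} else {
			out = append(out, b[j])
			j += 1
		}
	}
	// Drain whichever list still has items, mirroring the trailing loops above
	out = append(out, a[i:]...)
	out = append(out, b[j:]...)
	return out
}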

View File

@@ -0,0 +1,25 @@
package main

import (
	"fmt"
	"log"
	"os"

	"offline_twitter/scraper"
)

const INCLUDE_REPLIES = true

func main() {
	if len(os.Args) < 2 {
		log.Fatal("Must provide a user handle!")
	}
	user_handle := scraper.UserHandle(os.Args[1])
	user, err := scraper.GetUser(user_handle)
	if err != nil {
		log.Fatal(err.Error())
	}
	fmt.Printf("%v\n", user)
}

View File

@@ -0,0 +1,25 @@
package main

import (
	"fmt"
	"log"
	"os"

	"offline_twitter/scraper"
)

// const INCLUDE_REPLIES = true

func main() {
	if len(os.Args) < 2 {
		log.Fatal("Must provide a user handle!")
	}
	user_handle := scraper.UserHandle(os.Args[1])
	user, err := scraper.GetUser(user_handle)
	if err != nil {
		log.Fatal(err.Error())
	}
	fmt.Printf("%v\n", user)
}

View File

@@ -0,0 +1,220 @@
package scraper

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
	"time"
)

const API_CONVERSATION_BASE_PATH = "https://twitter.com/i/api/2/timeline/conversation/"
const API_USER_TIMELINE_BASE_PATH = "https://api.twitter.com/2/timeline/profile/"

type API struct{}

func (api API) GetFeedFor(user_id UserID, cursor string) (TweetResponse, error) {
	client := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequest("GET", API_USER_TIMELINE_BASE_PATH+string(user_id)+".json", nil)
	if err != nil {
		return TweetResponse{}, err
	}

	err = ApiRequestAddTokens(req)
	if err != nil {
		return TweetResponse{}, err
	}

	ApiRequestAddAllParams(req)
	if cursor != "" {
		UpdateQueryCursor(req, cursor, false)
	}

	resp, err := client.Do(req)
	if err != nil {
		return TweetResponse{}, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		content, _ := ioutil.ReadAll(resp.Body)
		return TweetResponse{}, fmt.Errorf("HTTP %s: %s", resp.Status, content)
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return TweetResponse{}, err
	}

	var response TweetResponse
	err = json.Unmarshal(body, &response)
	return response, err
}

// Resend the request to get more tweets if necessary
func (api API) GetMoreTweets(user_id UserID, response *TweetResponse, max_tweets int) error {
	last_response := response
	for last_response.GetCursor() != "" && len(response.GlobalObjects.Tweets) < max_tweets {
		fresh_response, err := api.GetFeedFor(user_id, last_response.GetCursor())
		if err != nil {
			return err
		}
		last_response = &fresh_response

		// Copy over the tweets and the users
		for id, tweet := range last_response.GlobalObjects.Tweets {
			response.GlobalObjects.Tweets[id] = tweet
		}
		for id, user := range last_response.GlobalObjects.Users {
			response.GlobalObjects.Users[id] = user
		}
	}
	return nil
}

func (api API) GetTweet(id string, cursor string) (TweetResponse, error) {
	client := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequest("GET", API_CONVERSATION_BASE_PATH+id+".json", nil)
	if err != nil {
		return TweetResponse{}, err
	}

	err = ApiRequestAddTokens(req)
	if err != nil {
		return TweetResponse{}, err
	}

	ApiRequestAddAllParams(req)
	if cursor != "" {
		UpdateQueryCursor(req, cursor, true)
	}

	resp, err := client.Do(req)
	if err != nil {
		return TweetResponse{}, err
	}
	defer resp.Body.Close()

	// 403 responses are parsed like 200s here; their bodies can still contain usable JSON
	if !(resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusForbidden) {
		content, _ := ioutil.ReadAll(resp.Body)
		return TweetResponse{}, fmt.Errorf("HTTP %s: %s", resp.Status, content)
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return TweetResponse{}, err
	}

	var response TweetResponse
	err = json.Unmarshal(body, &response)
	return response, err
}

// Resend the request to get more replies if necessary
func (api API) GetMoreReplies(tweet_id string, response *TweetResponse, max_replies int) error {
	last_response := response
	for last_response.GetCursor() != "" && len(response.GlobalObjects.Tweets) < max_replies {
		fresh_response, err := api.GetTweet(tweet_id, last_response.GetCursor())
		if err != nil {
			return err
		}
		last_response = &fresh_response

		// Copy over the tweets and the users
		for id, tweet := range last_response.GlobalObjects.Tweets {
			response.GlobalObjects.Tweets[id] = tweet
		}
		for id, user := range last_response.GlobalObjects.Users {
			response.GlobalObjects.Users[id] = user
		}
	}
	return nil
}

func UpdateQueryCursor(req *http.Request, new_cursor string, is_tweet bool) {
	query := req.URL.Query()
	query.Add("cursor", new_cursor)
	if is_tweet {
		query.Add("referrer", "tweet")
	}
	req.URL.RawQuery = query.Encode()
}

func (api API) GetUser(handle UserHandle) (APIUser, error) {
	client := &http.Client{Timeout: 10 * time.Second}
	// The `variables` query param is URL-encoded JSON: {"screen_name":"<handle>","withHighlightedLabel":true}
	req, err := http.NewRequest("GET", "https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22"+string(handle)+"%22%2C%22withHighlightedLabel%22%3Atrue%7D", nil)
	if err != nil {
		return APIUser{}, err
	}

	err = ApiRequestAddTokens(req)
	if err != nil {
		return APIUser{}, err
	}

	resp, err := client.Do(req)
	if err != nil {
		return APIUser{}, err
	}
	defer resp.Body.Close()

	if !(resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusForbidden) {
		content, _ := ioutil.ReadAll(resp.Body)
		return APIUser{}, fmt.Errorf("HTTP %s: %s", resp.Status, content)
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return APIUser{}, err
	}

	var response UserResponse
	err = json.Unmarshal(body, &response)
	return response.ConvertToAPIUser(), err
}

// Add Bearer token and guest token
func ApiRequestAddTokens(req *http.Request) error {
	req.Header.Set("Authorization", "Bearer "+BEARER_TOKEN)

	guestToken, err := GetGuestToken()
	if err != nil {
		return err
	}
	req.Header.Set("X-Guest-Token", guestToken)
	return nil
}

// Add the query params to get all data
func ApiRequestAddAllParams(req *http.Request) {
	query := req.URL.Query()
	query.Add("include_profile_interstitial_type", "1")
	query.Add("include_blocking", "1")
	query.Add("include_blocked_by", "1")
	query.Add("include_followed_by", "1")
	query.Add("include_want_retweets", "1")
	query.Add("include_mute_edge", "1")
	query.Add("include_can_dm", "1")
	query.Add("include_can_media_tag", "1")
	query.Add("skip_status", "1")
	query.Add("cards_platform", "Web-12")
	query.Add("include_cards", "1")
	query.Add("include_ext_alt_text", "true")
	query.Add("include_quote_count", "true")
	query.Add("include_reply_count", "1")
	query.Add("tweet_mode", "extended")
	query.Add("include_entities", "true")
	query.Add("include_user_entities", "true")
	query.Add("include_ext_media_color", "true")
	query.Add("include_ext_media_availability", "true")
	query.Add("send_error_codes", "true")
	query.Add("simple_quoted_tweet", "true")
	query.Add("include_tweet_replies", "true")
	query.Add("ext", "mediaStats,highlightedLabel")
	query.Add("count", "20")
	req.URL.RawQuery = query.Encode()
}
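
A minimal caller sketch for this API type (hypothetical, not part of this commit; the cap of 50 is an arbitrary choice):

// Hypothetical sketch: fetch a conversation page, then follow cursors
// until roughly 50 tweets are loaded.
func fetch_conversation(id string) (TweetResponse, error) {
	api := API{}
	resp, err := api.GetTweet(id, "") // empty cursor = first page
	if err != nil {
		return TweetResponse{}, err
	}
	// GetMoreReplies merges each page's tweets and users into resp
	err = api.GetMoreReplies(id, &resp, 50)
	return resp, err
}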

View File

@@ -5,6 +5,7 @@ import (
	"fmt"
)

const DEFAULT_MAX_REPLIES_EAGER_LOAD = 50

type TweetID string

@@ -75,6 +76,43 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
}

// Return a single tweet, nothing else
func GetTweet(id string) (Tweet, error) {
	api := API{}
	tweet_response, err := api.GetTweet(id, "")
	if err != nil {
		return Tweet{}, err
	}

	single_tweet, ok := tweet_response.GlobalObjects.Tweets[id]
	if !ok {
		return Tweet{}, fmt.Errorf("didn't get the tweet!\n%v", tweet_response)
	}

	return ParseSingleTweet(single_tweet)
}

// Return a list of tweets, including the original and the rest of its thread,
// along with a list of associated users
func GetTweetFull(id string) (tweets []Tweet, retweets []Retweet, users []User, err error) {
	api := API{}
	tweet_response, err := api.GetTweet(id, "")
	if err != nil {
		return
	}

	// If the first page came up short and there's a cursor, eagerly fetch more replies
	if len(tweet_response.GlobalObjects.Tweets) < DEFAULT_MAX_REPLIES_EAGER_LOAD &&
		tweet_response.GetCursor() != "" {
		err = api.GetMoreReplies(id, &tweet_response, DEFAULT_MAX_REPLIES_EAGER_LOAD)
		if err != nil {
			return
		}
	}

	return ParseTweetResponse(tweet_response)
}

func ParseTweetResponse(resp TweetResponse) (tweets []Tweet, retweets []Retweet, users []User, err error) {
	var new_tweet Tweet
	var new_retweet Retweet

View File

@@ -8,6 +8,14 @@ import (
type UserID string
type UserHandle string

// UIDArrayToStrArray converts a list of UserIDs into a list of strings
func UIDArrayToStrArray(uids []UserID) []string {
	ret := []string{}
	for _, uid := range uids {
		ret = append(ret, string(uid))
	}
	return ret
}

type User struct {
	ID          UserID
	DisplayName string

@@ -54,3 +62,13 @@ func ParseSingleUser(apiUser APIUser) (ret User, err error) {
	}
	return
}

// Calls API#GetUser and returns the parsed result
func GetUser(handle UserHandle) (User, error) {
	api := API{}
	apiUser, err := api.GetUser(handle)
	if err != nil {
		return User{}, err
	}
	return ParseSingleUser(apiUser)
}

scraper/user_feed.go (new file, 22 lines)
View File

@@ -0,0 +1,22 @@
package scraper

// Return a list of tweets from the given user's feed, along with the
// associated retweets and users
func GetFeedFull(user_id UserID, max_tweets int) (tweets []Tweet, retweets []Retweet, users []User, err error) {
	api := API{}
	tweet_response, err := api.GetFeedFor(user_id, "")
	if err != nil {
		return
	}

	// If the first page came up short of max_tweets and there's a cursor, fetch more pages
	if len(tweet_response.GlobalObjects.Tweets) < max_tweets &&
		tweet_response.GetCursor() != "" {
		err = api.GetMoreTweets(user_id, &tweet_response, max_tweets)
		if err != nil {
			return
		}
	}

	return ParseTweetResponse(tweet_response)
}
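
An end-to-end sketch (hypothetical, not part of this commit) showing how GetUser and GetFeedFull compose, mirroring what the cmd feed tool above does:

// Hypothetical sketch: resolve a handle, then fetch up to max_tweets from the feed.
func fetch_user_feed(handle UserHandle, max_tweets int) ([]Tweet, []Retweet, []User, error) {
	user, err := GetUser(handle)
	if err != nil {
		return nil, nil, nil, err
	}
	return GetFeedFull(user.ID, max_tweets)
}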