package scraper

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"time"

	log "github.com/sirupsen/logrus"
)

// Base URLs of the endpoints queried below; the numeric ID and ".json" are appended to form the
// full request URL.
const API_CONVERSATION_BASE_PATH = "https://twitter.com/i/api/2/timeline/conversation/"
const API_USER_TIMELINE_BASE_PATH = "https://api.twitter.com/2/timeline/profile/"

// API groups the HTTP calls made by the scraper.  It carries no state.
type API struct{}

// readFailedResponseBody reads the body of a response whose status code was not acceptable, so
// that it can be included in the error reported to the caller.  A read failure is returned as
// an error rather than panicking.
func readFailedResponseBody(resp *http.Response) ([]byte, error) {
	content, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("Error reading body of HTTP %s response:\n %w", resp.Status, err)
	}
	return content, nil
}

// mergeGlobalObjects copies every tweet and every user from `src` into the corresponding maps
// of `dst`, overwriting entries that have the same key.
func mergeGlobalObjects(dst *TweetResponse, src *TweetResponse) {
	for id, tweet := range src.GlobalObjects.Tweets {
		dst.GlobalObjects.Tweets[id] = tweet
	}
	for id, user := range src.GlobalObjects.Users {
		dst.GlobalObjects.Users[id] = user
	}
}

// GetFeedFor requests one page of the profile timeline for `user_id`.
//
// args:
// - user_id: the user's UserID
// - cursor: pagination cursor; if empty, no cursor is sent
//
// returns: the parsed response, or an error if the request cannot be built or executed, the
// status is anything other than 200 OK, or the body cannot be read or parsed.
func (api API) GetFeedFor(user_id UserID, cursor string) (TweetResponse, error) {
	client := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequest("GET", fmt.Sprintf("%s%d.json", API_USER_TIMELINE_BASE_PATH, user_id), nil)
	if err != nil {
		return TweetResponse{}, fmt.Errorf("Error initializing HTTP request for GetFeedFor(%d):\n %w", user_id, err)
	}

	err = ApiRequestAddTokens(req)
	if err != nil {
		return TweetResponse{}, fmt.Errorf("Error adding tokens to HTTP request:\n %w", err)
	}

	ApiRequestAddAllParams(req)

	if cursor != "" {
		UpdateQueryCursor(req, cursor, false)
	}

	resp, err := client.Do(req)
	if err != nil {
		return TweetResponse{}, fmt.Errorf("Error executing HTTP request for GetFeedFor(%d):\n %w", user_id, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		content, err := readFailedResponseBody(resp)
		if err != nil {
			return TweetResponse{}, err
		}
		// Include the response headers in the error to help diagnose the failure
		s := ""
		for header := range resp.Header {
			s += fmt.Sprintf(" %s: %s\n", header, resp.Header.Get(header))
		}
		return TweetResponse{}, fmt.Errorf("HTTP %s\n%s\n%s", resp.Status, s, content)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return TweetResponse{}, fmt.Errorf("Error reading response body for GetFeedFor(%d):\n %w", user_id, err)
	}
	log.Debug(string(body))

	var response TweetResponse
	err = json.Unmarshal(body, &response)
	if err != nil {
		return response, fmt.Errorf("Error parsing API response for GetFeedFor(%d):\n %w", user_id, err)
	}
	return response, nil
}

// GetMoreTweetsFromFeed resends the feed request to get more tweets if necessary.
//
// args:
// - user_id: the user's UserID
// - response: an "out" parameter; the TweetResponse that tweets, RTs and users will be appended to
// - min_tweets: the desired minimum amount of tweets to get
//
// returns: END_OF_FEED when one of the end-of-feed checks below fires, any error from
// GetFeedFor, or nil once `min_tweets` is reached or the cursor is empty.
func (api API) GetMoreTweetsFromFeed(user_id UserID, response *TweetResponse, min_tweets int) error {
	// TODO user-feed-infinite-fetch: reaching the end of the user's timeline could loop forever
	// getting no new tweets.  The two END_OF_FEED checks below are meant to catch that; confirm
	// they cover every case.
	last_response := response
	for last_response.GetCursor() != "" && len(response.GlobalObjects.Tweets) < min_tweets {
		fresh_response, err := api.GetFeedFor(user_id, last_response.GetCursor())
		if err != nil {
			return err
		}

		if fresh_response.GetCursor() == last_response.GetCursor() && len(fresh_response.GlobalObjects.Tweets) == 0 {
			// Empty response, cursor same as previous: end of feed has been reached
			return END_OF_FEED
		}
		if fresh_response.IsEndOfFeed() {
			// Response has a pinned tweet, but no other content: end of feed has been reached
			return END_OF_FEED
		}

		last_response = &fresh_response

		// Copy over the tweets and the users
		mergeGlobalObjects(response, last_response)
		fmt.Printf("Have %d tweets, and %d users so far\n", len(response.GlobalObjects.Tweets), len(response.GlobalObjects.Users))
	}
	return nil
}

// GetTweet requests one page of the conversation timeline for tweet `id`.
//
// args:
// - id: the TweetID of the tweet
// - cursor: pagination cursor; if empty, no cursor is sent
//
// returns: the parsed response, or an error if the request cannot be built or executed, the
// status is neither 200 OK nor 403 Forbidden, or the body cannot be read or parsed.
func (api API) GetTweet(id TweetID, cursor string) (TweetResponse, error) {
	client := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequest("GET", fmt.Sprintf("%s%d.json", API_CONVERSATION_BASE_PATH, id), nil)
	if err != nil {
		return TweetResponse{}, fmt.Errorf("Error initializing HTTP request:\n %w", err)
	}

	err = ApiRequestAddTokens(req)
	if err != nil {
		return TweetResponse{}, fmt.Errorf("Error adding tokens to HTTP request:\n %w", err)
	}

	ApiRequestAddAllParams(req)
	if cursor != "" {
		UpdateQueryCursor(req, cursor, true)
	}

	resp, err := client.Do(req)
	if err != nil {
		return TweetResponse{}, fmt.Errorf("Error executing HTTP request:\n %w", err)
	}
	defer resp.Body.Close()

	// A 403 Forbidden is let through here and its body is parsed like a 200 OK
	if !(resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusForbidden) {
		content, err := readFailedResponseBody(resp)
		if err != nil {
			return TweetResponse{}, err
		}
		return TweetResponse{}, fmt.Errorf("Error getting %q. HTTP %s: %s", req.URL, resp.Status, content)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return TweetResponse{}, fmt.Errorf("Error reading HTTP response:\n %w", err)
	}
	log.Debug(string(body))

	var response TweetResponse
	err = json.Unmarshal(body, &response)
	if err != nil {
		return response, fmt.Errorf("Error parsing API response for GetTweet(%d):\n %w", id, err)
	}
	return response, nil
}

// GetMoreReplies resends the conversation request to get more replies if necessary.
//
// args:
// - tweet_id: the TweetID of the tweet whose conversation is being fetched
// - response: an "out" parameter; the TweetResponse that tweets and users will be appended to
// - max_replies: fetching stops once `response` holds at least this many tweets
//
// returns: any error from GetTweet, otherwise nil.
func (api API) GetMoreReplies(tweet_id TweetID, response *TweetResponse, max_replies int) error {
	last_response := response
	for last_response.GetCursor() != "" && len(response.GlobalObjects.Tweets) < max_replies {
		fresh_response, err := api.GetTweet(tweet_id, last_response.GetCursor())
		if err != nil {
			return err
		}

		if fresh_response.GetCursor() == last_response.GetCursor() && len(fresh_response.GlobalObjects.Tweets) == 0 {
			// Empty response, cursor same as previous: the next request would be identical, so
			// stop here instead of requesting the same page again indefinitely
			break
		}

		last_response = &fresh_response

		// Copy over the tweets and the users
		mergeGlobalObjects(response, last_response)
	}
	return nil
}

// UpdateQueryCursor adds a "cursor" query parameter with value `new_cursor` to `req`.  If
// `is_tweet` is true, it also adds "referrer=tweet".
func UpdateQueryCursor(req *http.Request, new_cursor string, is_tweet bool) {
	query := req.URL.Query()
	query.Add("cursor", new_cursor)
	if is_tweet {
		query.Add("referrer", "tweet")
	}
	req.URL.RawQuery = query.Encode()
}

// fetchUserResponse executes `req` once and parses the body as a UserResponse.  The response
// body is closed before returning, so calling this in a retry loop does not accumulate open
// bodies.
func fetchUserResponse(client *http.Client, req *http.Request, handle UserHandle) (UserResponse, error) {
	resp, err := client.Do(req)
	if err != nil {
		return UserResponse{}, fmt.Errorf("Error executing HTTP request for GetUser(%s):\n %w", handle, err)
	}
	defer resp.Body.Close()

	// Sometimes it randomly gives 403 Forbidden.  API's fault, not ours
	// The caller checks for this by inspecting the parsed errors
	if !(resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusForbidden) {
		content, err := readFailedResponseBody(resp)
		if err != nil {
			return UserResponse{}, err
		}
		return UserResponse{}, fmt.Errorf("response status %s: %s", resp.Status, content)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return UserResponse{}, fmt.Errorf("Error retrieving API response to GetUser(%s):\n %w", handle, err)
	}
	log.Debug(string(body))

	var response UserResponse
	err = json.Unmarshal(body, &response)
	if err != nil {
		return UserResponse{}, fmt.Errorf("Error parsing API response to GetUser(%s):\n %w", handle, err)
	}
	return response, nil
}

// GetUser looks up a user by screen name via the UserByScreenName GraphQL endpoint.
//
// The request is attempted up to 3 times; a retry happens only when the response contains
// exactly one error, with code 50 and a name other than "NotFoundError".
//
// returns: the response converted with ConvertToAPIUser, or an error if the request cannot be
// built or executed, the status is neither 200 OK nor 403 Forbidden, the body cannot be read or
// parsed, or every attempt ended in the retryable error.
func (api API) GetUser(handle UserHandle) (APIUser, error) {
	client := &http.Client{Timeout: 10 * time.Second}

	// Build the "variables" JSON with the encoder rather than by string concatenation, so that a
	// handle containing quotes or other special characters cannot corrupt the query
	variables, err := json.Marshal(struct {
		ScreenName           string `json:"screen_name"`
		WithHighlightedLabel bool   `json:"withHighlightedLabel"`
	}{ScreenName: string(handle), WithHighlightedLabel: true})
	if err != nil {
		return APIUser{}, fmt.Errorf("Error initializing HTTP request:\n %w", err)
	}

	req, err := http.NewRequest(
		"GET",
		"https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName?variables="+url.QueryEscape(string(variables)),
		nil)
	if err != nil {
		return APIUser{}, fmt.Errorf("Error initializing HTTP request:\n %w", err)
	}

	err = ApiRequestAddTokens(req)
	if err != nil {
		return APIUser{}, fmt.Errorf("Error adding tokens to HTTP request:\n %w", err)
	}

	const max_attempts = 3
	for attempt := 0; attempt < max_attempts; attempt += 1 {
		response, err := fetchUserResponse(client, req, handle)
		if err != nil {
			return APIUser{}, err
		}

		// Retry ONLY if the error is code 50 (random authentication failure), NOT on real errors
		if len(response.Errors) == 1 && response.Errors[0].Code == 50 && response.Errors[0].Name != "NotFoundError" {
			continue
		}

		// Success, or a real error: hand the response over as-is
		return response.ConvertToAPIUser(), nil
	}

	// Every attempt hit the retryable failure; report it instead of returning an empty user
	return APIUser{}, fmt.Errorf("GetUser(%s): still getting error code 50 after %d attempts", handle, max_attempts)
}

// Search requests one page of results from the adaptive search endpoint.
//
// args:
// - query: the search query; it is URL-escaped before being sent
// - cursor: pagination cursor; if empty, no cursor is sent
//
// returns: the parsed response, or an error if the request cannot be built or executed, the
// status is anything other than 200 OK, or the body cannot be read or parsed.
func (api API) Search(query string, cursor string) (TweetResponse, error) {
	client := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequest(
		"GET",
		"https://twitter.com/i/api/2/search/adaptive.json?count=50&spelling_corrections=1&query_source=typed_query&pc=1&q="+
			url.QueryEscape(query),
		nil)
	if err != nil {
		return TweetResponse{}, fmt.Errorf("Error initializing HTTP request:\n %w", err)
	}

	err = ApiRequestAddTokens(req)
	if err != nil {
		return TweetResponse{}, fmt.Errorf("Error adding tokens to HTTP request:\n %w", err)
	}

	ApiRequestAddAllParams(req)
	if cursor != "" {
		UpdateQueryCursor(req, cursor, false)
	}
	fmt.Println(req.URL.String())

	resp, err := client.Do(req)
	if err != nil {
		return TweetResponse{}, fmt.Errorf("Error executing HTTP request for Search(%q):\n %w", query, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		content, err := readFailedResponseBody(resp)
		if err != nil {
			return TweetResponse{}, err
		}
		return TweetResponse{}, fmt.Errorf("Error while searching for %q. HTTP %s: %s", req.URL, resp.Status, content)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return TweetResponse{}, fmt.Errorf("Error retrieving API response for Search(%q):\n %w", query, err)
	}
	// fmt.Println(string(body))

	var response TweetResponse
	err = json.Unmarshal(body, &response)
	if err != nil {
		return response, fmt.Errorf("Error parsing API response to Search(%q):\n %w", query, err)
	}
	return response, nil
}

// GetMoreTweetsFromSearch resends the search request to get more results if necessary.
//
// args:
// - query: the search query
// - response: an "out" parameter; the TweetResponse that tweets and users will be appended to
// - max_results: fetching stops once `response` holds at least this many tweets
//
// returns: END_OF_FEED when a page comes back empty with an unchanged cursor, any error from
// Search, or nil once `max_results` is reached or the cursor is empty.
func (api API) GetMoreTweetsFromSearch(query string, response *TweetResponse, max_results int) error {
	last_response := response
	for last_response.GetCursor() != "" && len(response.GlobalObjects.Tweets) < max_results {
		fresh_response, err := api.Search(query, last_response.GetCursor())
		if err != nil {
			return err
		}

		if fresh_response.GetCursor() == last_response.GetCursor() && len(fresh_response.GlobalObjects.Tweets) == 0 {
			// Empty response, cursor same as previous: end of feed has been reached
			return END_OF_FEED
		}

		last_response = &fresh_response

		// Copy the results over
		mergeGlobalObjects(response, last_response)
		fmt.Printf("Have %d tweets\n", len(response.GlobalObjects.Tweets))
		// fmt.Printf("Cursor: %s\n", last_response.GetCursor())
	}
	return nil
}

// ApiRequestAddTokens adds the Bearer token and the guest token to `req`.
//
// The "Authorization" and "x-twitter-client-language" headers are set first; if fetching the
// guest token then fails, that error is returned and "X-Guest-Token" is not set.
func ApiRequestAddTokens(req *http.Request) error {
	req.Header.Set("Authorization", "Bearer "+BEARER_TOKEN)
	req.Header.Set("x-twitter-client-language", "en")

	guestToken, err := GetGuestToken()
	if err != nil {
		return err
	}
	req.Header.Set("X-Guest-Token", guestToken)
	return nil
}

// ApiRequestAddAllParams adds the query params to get all data.
//
// Each param is appended to whatever the URL already carries (nothing is replaced), and the
// query string is then re-encoded.
func ApiRequestAddAllParams(req *http.Request) {
	params := [][2]string{
		{"include_profile_interstitial_type", "1"},
		{"include_blocking", "1"},
		{"include_blocked_by", "1"},
		{"include_followed_by", "1"},
		{"include_want_retweets", "1"},
		{"include_mute_edge", "1"},
		{"include_can_dm", "1"},
		{"include_can_media_tag", "1"},
		{"skip_status", "1"},
		{"cards_platform", "Web-12"},
		{"include_cards", "1"},
		{"include_ext_alt_text", "true"},
		{"include_quote_count", "true"},
		{"include_reply_count", "1"},
		{"tweet_mode", "extended"},
		{"include_entities", "true"},
		{"include_user_entities", "true"},
		{"include_ext_media_availability", "true"},
		{"send_error_codes", "true"},
		{"simple_quoted_tweet", "true"},
		{"include_tweet_replies", "true"},
		{"ext", "mediaStats,highlightedLabel"},
		{"count", "20"},
	}

	query := req.URL.Query()
	for _, param := range params {
		query.Add(param[0], param[1])
	}
	req.URL.RawQuery = query.Encode()
}