
- ensure all scraper functions have an `api.XYZ` version and a package-level convenience function - isolate `the_api` to top-level convenience functions, in preparation for removal - move a bunch of scraper functions around to be near their related functions - new ErrLoginRequired - remove obsolete APIv1 stuff (Feed, TweetDetail) - rename scraper function GetUserFeedGraphqlFor => GetUserFeed - fix go.mod Go version incorrectly claiming it's compatible with Go 1.16 (should be Go 1.17)
392 lines
10 KiB
Go
392 lines
10 KiB
Go
package scraper
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net"
|
|
"net/http"
|
|
"net/http/cookiejar"
|
|
"net/url"
|
|
"strings"
|
|
"time"
|
|
|
|
log "github.com/sirupsen/logrus"
|
|
)
|
|
|
|
type API struct {
|
|
UserHandle UserHandle
|
|
UserID UserID
|
|
IsAuthenticated bool
|
|
GuestToken string
|
|
Client http.Client
|
|
CSRFToken string
|
|
}
|
|
|
|
// Use a global API variable since it is needed in so many utility functions (e.g.,
|
|
// tweet_trove.FillSpaceDetails, tweet_trove.FetchTombstoneUsers, etc.); this avoids having
|
|
// to inject it everywhere.
|
|
//
|
|
// Should be set by the caller (main program) depending on the session file used.
|
|
var the_api API
|
|
|
|
// Initializer for the global api variable
|
|
func InitApi(newApi API) {
|
|
the_api = newApi
|
|
}
|
|
|
|
type api_outstruct struct {
|
|
Cookies []*http.Cookie
|
|
UserID UserID
|
|
UserHandle UserHandle
|
|
IsAuthenticated bool
|
|
GuestToken string
|
|
CSRFToken string
|
|
}
|
|
|
|
// TWITTER_BASE_URL is the URL used as the lookup key when reading session
// cookies from, and storing them into, the cookie jar during (un)marshalling.
var TWITTER_BASE_URL = url.URL{
	Scheme: "https",
	Host:   "twitter.com",
}
|
|
|
|
func (api API) MarshalJSON() ([]byte, error) {
|
|
result, err := json.Marshal(api_outstruct{
|
|
Cookies: api.Client.Jar.Cookies(&TWITTER_BASE_URL),
|
|
UserID: api.UserID,
|
|
UserHandle: api.UserHandle,
|
|
IsAuthenticated: api.IsAuthenticated,
|
|
GuestToken: api.GuestToken,
|
|
CSRFToken: api.CSRFToken,
|
|
})
|
|
if err != nil {
|
|
return result, fmt.Errorf("Unable to JSONify the api:\n %w", err)
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
func (api *API) UnmarshalJSON(data []byte) error {
|
|
var in_struct api_outstruct
|
|
err := json.Unmarshal(data, &in_struct)
|
|
if err != nil {
|
|
return fmt.Errorf("Unable to unmarshal:\n %w", err)
|
|
}
|
|
cookie_jar, err := cookiejar.New(nil)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
for i := range in_struct.Cookies {
|
|
in_struct.Cookies[i].Domain = ".twitter.com"
|
|
}
|
|
cookie_jar.SetCookies(&TWITTER_BASE_URL, in_struct.Cookies)
|
|
api.IsAuthenticated = in_struct.IsAuthenticated
|
|
api.GuestToken = in_struct.GuestToken
|
|
api.UserID = in_struct.UserID
|
|
api.UserHandle = in_struct.UserHandle
|
|
|
|
api.Client = http.Client{
|
|
Timeout: 10 * time.Second,
|
|
Jar: cookie_jar,
|
|
}
|
|
api.CSRFToken = in_struct.CSRFToken
|
|
return nil
|
|
}
|
|
|
|
func (api API) add_authentication_headers(req *http.Request) {
|
|
// Params for every request
|
|
req.Header.Set("Authorization", "Bearer "+BEARER_TOKEN)
|
|
req.Header.Set("x-twitter-client-language", "en")
|
|
|
|
if api.IsAuthenticated {
|
|
if api.CSRFToken == "" {
|
|
panic("No CSRF token set!")
|
|
}
|
|
req.Header.Set("x-csrf-token", api.CSRFToken)
|
|
} else {
|
|
// Not authenticated; use guest token
|
|
if api.GuestToken == "" {
|
|
panic("No guest token set!")
|
|
}
|
|
req.Header.Set("X-Guest-Token", api.GuestToken)
|
|
}
|
|
}
|
|
|
|
func NewGuestSession() (API, error) {
|
|
guestAPIString, err := GetGuestTokenWithRetries(3, 1*time.Second)
|
|
if err != nil {
|
|
return API{}, err
|
|
}
|
|
|
|
jar, err := cookiejar.New(nil)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
return API{
|
|
IsAuthenticated: false,
|
|
GuestToken: guestAPIString,
|
|
Client: http.Client{
|
|
Timeout: 10 * time.Second,
|
|
Jar: jar,
|
|
},
|
|
CSRFToken: "",
|
|
}, nil
|
|
}
|
|
|
|
func (api *API) update_csrf_token() {
|
|
dummyURL, err := url.Parse("https://twitter.com/i/api/1.1/onboarding/task.json")
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
for _, cookie := range api.Client.Jar.Cookies(dummyURL) {
|
|
if cookie.Name == "ct0" {
|
|
api.CSRFToken = cookie.Value
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// is_timeout reports whether err, or an error it wraps, is a timeout.
//
// A *url.Error anywhere in the chain is consulted first; failing that, the
// first net.Error in the chain is consulted. Any other error (including nil)
// is not a timeout.
func is_timeout(err error) bool {
	var (
		url_err *url.Error
		net_err net.Error
	)
	switch {
	case errors.As(err, &url_err):
		return url_err.Timeout()
	case errors.As(err, &net_err):
		return net_err.Timeout()
	default:
		return false
	}
}
|
|
|
|
func (api *API) do_http_POST(remote_url string, body string, result interface{}) error {
|
|
req, err := http.NewRequest("POST", remote_url, strings.NewReader(body))
|
|
if err != nil {
|
|
return fmt.Errorf("Error initializing HTTP POST request:\n %w", err)
|
|
}
|
|
|
|
if len(body) == 0 || body[0] == '{' { // TODO: unclear what the content-type should be if body is empty; might not matter
|
|
req.Header.Set("content-type", "application/json")
|
|
} else {
|
|
req.Header.Set("content-type", "application/x-www-form-urlencoded")
|
|
}
|
|
|
|
api.add_authentication_headers(req)
|
|
|
|
log.Debugf("POST: %s\n", req.URL.String())
|
|
for header := range req.Header {
|
|
log.Debugf(" %s: %s\n", header, req.Header.Get(header))
|
|
}
|
|
log.Debug(" " + body)
|
|
|
|
resp, err := api.Client.Do(req)
|
|
if is_timeout(err) {
|
|
return fmt.Errorf("POST %q:\n %w", remote_url, ErrRequestTimeout)
|
|
} else if err != nil {
|
|
return fmt.Errorf("Error executing HTTP POST request:\n %w", err)
|
|
}
|
|
api.update_csrf_token()
|
|
|
|
defer resp.Body.Close()
|
|
|
|
respBody, err := io.ReadAll(resp.Body)
|
|
if is_timeout(err) {
|
|
return fmt.Errorf("GET %q:\n reading response body:\n %w", remote_url, ErrRequestTimeout)
|
|
} else if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
if resp.StatusCode == 204 {
|
|
// No Content
|
|
return nil
|
|
}
|
|
|
|
if resp.StatusCode != 200 {
|
|
responseHeaders := ""
|
|
for header := range resp.Header {
|
|
responseHeaders += fmt.Sprintf(" %s: %s\n", header, resp.Header.Get(header))
|
|
}
|
|
return fmt.Errorf("HTTP %s\n%s\n%s", resp.Status, responseHeaders, respBody)
|
|
}
|
|
|
|
log.Debug(string(respBody))
|
|
|
|
err = json.Unmarshal(respBody, result)
|
|
if err != nil {
|
|
return fmt.Errorf("Error parsing API response:\n %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (api *API) do_http(remote_url string, cursor string, result interface{}) error {
|
|
req, err := http.NewRequest("GET", remote_url, nil)
|
|
if err != nil {
|
|
return fmt.Errorf("Error initializing HTTP GET request:\n %w", err)
|
|
}
|
|
|
|
if cursor != "" {
|
|
query := req.URL.Query()
|
|
query.Add("cursor", cursor)
|
|
req.URL.RawQuery = query.Encode()
|
|
}
|
|
|
|
api.add_authentication_headers(req)
|
|
|
|
log.Debugf("GET: %s\n", req.URL.String())
|
|
for header := range req.Header {
|
|
log.Debugf(" %s: %s\n", header, req.Header.Get(header))
|
|
}
|
|
|
|
resp, err := api.Client.Do(req)
|
|
if is_timeout(err) {
|
|
return fmt.Errorf("GET %q:\n %w", remote_url, ErrRequestTimeout)
|
|
} else if err != nil {
|
|
return fmt.Errorf("Error executing HTTP request:\n %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if api.IsAuthenticated {
|
|
// New request has been made, so the cookie will be changed; update the csrf to match
|
|
api.update_csrf_token()
|
|
}
|
|
|
|
if resp.StatusCode == 429 {
|
|
// "Too many requests" => rate limited
|
|
reset_at := TimestampFromUnix(int64(int_or_panic(resp.Header.Get("X-Rate-Limit-Reset"))))
|
|
return fmt.Errorf("%w (resets at %d, which is in %s)", ErrRateLimited, reset_at.Unix(), time.Until(reset_at.Time).String())
|
|
}
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if is_timeout(err) {
|
|
return fmt.Errorf("GET %q:\n reading response body:\n %w", remote_url, ErrRequestTimeout)
|
|
} else if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
if resp.StatusCode != 200 && resp.StatusCode != 403 {
|
|
responseHeaders := ""
|
|
for header := range resp.Header {
|
|
responseHeaders += fmt.Sprintf(" %s: %s\n", header, resp.Header.Get(header))
|
|
}
|
|
return fmt.Errorf("HTTP Error. HTTP %s\n%s\nbody: %s", resp.Status, responseHeaders, body)
|
|
}
|
|
|
|
log.Debug(string(body))
|
|
|
|
err = json.Unmarshal(body, result)
|
|
if err != nil {
|
|
return fmt.Errorf("Error parsing API response:\n %w", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// add_tweet_query_params appends the fixed set of query parameters that ask
// the API for all available tweet data. Each parameter is added (not set), so
// values already present under the same key are kept.
func add_tweet_query_params(query *url.Values) {
	// Key/value pairs, in the order they are appended.
	params := [...][2]string{
		{"include_profile_interstitial_type", "1"},
		{"include_blocking", "1"},
		{"include_blocked_by", "1"},
		{"include_followed_by", "1"},
		{"include_want_retweets", "1"},
		{"include_mute_edge", "1"},
		{"include_can_dm", "1"},
		{"include_can_media_tag", "1"},
		{"skip_status", "1"},
		{"cards_platform", "Web-12"},
		{"include_cards", "1"},
		{"include_ext_alt_text", "true"},
		{"include_quote_count", "true"},
		{"include_reply_count", "1"},
		{"tweet_mode", "extended"},
		{"include_entities", "true"},
		{"include_user_entities", "true"},
		{"include_ext_media_availability", "true"},
		{"send_error_codes", "true"},
		{"simple_quoted_tweet", "true"},
		{"include_tweet_replies", "true"},
		{"ext", "mediaStats,highlightedLabel"},
		{"count", "20"},
	}
	for _, pair := range params {
		query.Add(pair[0], pair[1])
	}
}
|
|
|
|
func DownloadMedia(url string) ([]byte, error) {
|
|
return the_api.DownloadMedia(url)
|
|
}
|
|
|
|
func (api *API) DownloadMedia(remote_url string) ([]byte, error) {
|
|
fmt.Printf("Downloading: %s\n", remote_url)
|
|
req, err := http.NewRequest("GET", remote_url, nil)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
// api.add_authentication_headers(req)
|
|
// req.Header.Set("Referer", "https://twitter.com/") // DM embedded images require this header
|
|
|
|
resp, err := api.Client.Do(req)
|
|
if is_timeout(err) {
|
|
return []byte{}, fmt.Errorf("GET %q:\n waiting for headers:\n %w", remote_url, ErrRequestTimeout)
|
|
} else if err != nil {
|
|
return []byte{}, fmt.Errorf("Error executing HTTP request:\n %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if api.IsAuthenticated {
|
|
// New request has been made, so the cookie will be changed; update the csrf to match
|
|
api.update_csrf_token()
|
|
}
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if is_timeout(err) {
|
|
return []byte{}, fmt.Errorf("GET %q:\n reading response body:\n %w", remote_url, ErrRequestTimeout)
|
|
} else if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
if resp.StatusCode == 403 {
|
|
var response struct {
|
|
Error_response string `json:"error_response"`
|
|
}
|
|
fmt.Println(string(body))
|
|
|
|
err = json.Unmarshal(body, &response)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
if response.Error_response == "Dmcaed" {
|
|
return body, ErrorDMCA
|
|
}
|
|
// Not a DCMA; fall through
|
|
}
|
|
|
|
if resp.StatusCode == 404 {
|
|
log.Debugf("Media download 404 (%s)", remote_url)
|
|
return body, ErrMediaDownload404
|
|
}
|
|
|
|
if resp.StatusCode != 200 {
|
|
url, err := url.Parse(remote_url)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
print_curl_cmd(*req, api.Client.Jar.Cookies(url))
|
|
|
|
responseHeaders := ""
|
|
for header := range resp.Header {
|
|
responseHeaders += fmt.Sprintf(" %s: %s\n", header, resp.Header.Get(header))
|
|
}
|
|
log.Debug(responseHeaders)
|
|
return body, fmt.Errorf("HTTP Error. HTTP %s\n%s\nbody: %s", resp.Status, responseHeaders, body)
|
|
}
|
|
|
|
// Status code is HTTP 200
|
|
return body, nil
|
|
}
|
|
|
|
// print_curl_cmd writes to standard output a curl command line equivalent to
// request r: its method and URL, one -H flag per header (first value only),
// a Cookie header assembled from cookies, and --compressed.
func print_curl_cmd(r http.Request, cookies []*http.Cookie) {
	fmt.Printf("curl -X %s %q \\\n", r.Method, r.URL.String())

	for name := range r.Header {
		fmt.Print(" -H '" + name + ": " + r.Header.Get(name) + "' \\\n")
	}

	// All cookies go into a single Cookie header, each terminated by ';'
	fmt.Print(" -H 'Cookie: ")
	for i := range cookies {
		fmt.Print(cookies[i].Name + "=" + cookies[i].Value + ";")
	}
	fmt.Print("' \\\n")

	fmt.Print(" --compressed\n")
}
|