Reviewer-amended patch (line breaks and indentation restored from a whitespace-collapsed copy).
Amendments: GetCursor no longer indexes an empty Instructions slice; doc comments were added to
the new functions; a stray semicolon was removed in search().  Hunk line counts were recomputed
for the amended hunks; the "index" blob hashes are those of the original patch and no longer
match.  The <TARGET> placeholder in the help text was restored by inference -- confirm against
cmd/twitter/helpers.go before applying.

diff --git a/cmd/tests.sh b/cmd/tests.sh
index 1a1f2e2..310ccbf 100755
--- a/cmd/tests.sh
+++ b/cmd/tests.sh
@@ -181,6 +181,10 @@ test $(sqlite3 twitter.db "select is_stub from tweets where id = 145452142414465
 test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454522147750260742") = 1
 
 
+# Test search
+tw search "from:michaelmalice constitution"
+test $(sqlite3 twitter.db "select count(*) from tweets where user_id = 44067298 and text like '%constitution%'") -gt "30" # Not sure exactly how many
+
 # TODO: Maybe this file should be broken up into multiple test scripts
 
 echo -e "\033[32mAll tests passed. Finished successfully.\033[0m"
diff --git a/cmd/twitter/helpers.go b/cmd/twitter/helpers.go
index d8f71d6..ee22367 100644
--- a/cmd/twitter/helpers.go
+++ b/cmd/twitter/helpers.go
@@ -46,6 +46,9 @@ This application downloads tweets from twitter and saves them in a SQLite databa
         <TARGET> is the user handle.
         Gets the most recent ~50 tweets.
         If "get_user_tweets_all" is used, gets up to ~3200 tweets (API limit).
+
+    search
+        <TARGET> is the search query. Should be wrapped in quotes if it has spaces.
 `
 
 
diff --git a/cmd/twitter/main.go b/cmd/twitter/main.go
index e2e2548..8a9bf20 100644
--- a/cmd/twitter/main.go
+++ b/cmd/twitter/main.go
@@ -61,6 +61,8 @@ func main() {
 		fetch_user_feed(target, 999999999)
 	case "download_tweet_content":
 		download_tweet_content(target)
+	case "search":
+		search(target)
 	default:
 		die("Invalid operation: " + operation, true, 3)
 	}
@@ -238,7 +240,7 @@ func download_tweet_content(tweet_identifier string) {
 
 	tweet, err := profile.GetTweetById(tweet_id)
 	if err != nil {
-		panic("Couldn't get tweet from database: " + err.Error())
+		panic(fmt.Sprintf("Couldn't get tweet (ID %d) from database: %s", tweet_id, err.Error()))
 	}
 	err = profile.DownloadTweetContentFor(&tweet)
 	if err != nil {
@@ -256,3 +258,39 @@ func download_user_content(handle scraper.UserHandle) {
 		panic("Error getting content: " + err.Error())
 	}
 }
+
+// search runs scraper.Search for `query` (second argument 1000, presumably a result cap), then
+// saves every returned user and tweet and downloads their content via the profile.
+// The first error is reported through die().
+func search(query string) {
+	tweets, retweets, users, err := scraper.Search(query, 1000)
+	if err != nil {
+		die("Error scraping search results: " + err.Error(), false, -100)
+	}
+
+	for _, u := range users {
+		err = profile.SaveUser(u)
+		if err != nil {
+			die("Error saving user: " + err.Error(), false, 4)
+		}
+		err = profile.DownloadUserContentFor(&u)
+		if err != nil {
+			die("Error getting user content: " + err.Error(), false, 10)
+		}
+	}
+
+	for _, t := range tweets {
+		// fmt.Println(t)
+		err = profile.SaveTweet(t)
+		if err != nil {
+			die("Error saving tweet: " + err.Error(), false, 4)
+		}
+		err = profile.DownloadTweetContentFor(&t)
+		if err != nil {
+			die("Error getting tweet content: " + err.Error(), false, 11)
+		}
+	}
+
+	// NOTE(review): retweets are only counted below; nothing in this function saves them -- confirm intent.
+	fmt.Printf("Saved %d tweets, %d retweets and %d users. Exiting successfully\n", len(tweets), len(retweets), len(users))
+}
diff --git a/doc/curl requests b/doc/curl requests
index 297a4b8..83424a0 100644
--- a/doc/curl requests
+++ b/doc/curl requests
@@ -26,3 +26,9 @@ curl \
 	-H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
 	-H "X-Guest-Token: 1396177150890348547" \
 	https://twitter.com/i/api/2/timeline/conversation/1395881699142160387.json
+
+
+curl \
+	-H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" \
+	-H "X-Guest-Token: 1449946080792104970" \
+	"https://twitter.com/i/api/2/search/adaptive.json?count=50&spelling_corrections=1&query_source=typed_query&pc=1&q=potatoes"
diff --git a/scraper/api_request_utils.go b/scraper/api_request_utils.go
index 453bb76..cb906fd 100644
--- a/scraper/api_request_utils.go
+++ b/scraper/api_request_utils.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"io/ioutil"
 	"net/http"
+	"net/url"
 	"time"
 )
 
@@ -217,6 +218,80 @@ func (api API) GetUser(handle UserHandle) (APIUser, error) {
 	return response.ConvertToAPIUser(), err
 }
 
+// Search requests one batch of results for `query` from the search/adaptive.json endpoint.
+// A non-empty `cursor` is applied to the request via UpdateQueryCursor before it is sent.
+func (api API) Search(query string, cursor string) (TweetResponse, error) {
+	client := &http.Client{Timeout: 10 * time.Second}
+	req, err := http.NewRequest("GET", "https://twitter.com/i/api/2/search/adaptive.json?count=50&spelling_corrections=1&query_source=typed_query&pc=1&q=" + url.QueryEscape(query), nil)
+	if err != nil {
+		return TweetResponse{}, err
+	}
+
+	err = ApiRequestAddTokens(req)
+	if err != nil {
+		return TweetResponse{}, err
+	}
+
+	ApiRequestAddAllParams(req)
+	if cursor != "" {
+		UpdateQueryCursor(req, cursor, false)
+	}
+
+	fmt.Println(req.URL.String())
+
+	resp, err := client.Do(req)
+	if err != nil {
+		return TweetResponse{}, err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		content, _ := ioutil.ReadAll(resp.Body)
+		return TweetResponse{}, fmt.Errorf("Error while searching for %q. HTTP %s: %s", req.URL, resp.Status, content)
+	}
+
+	body, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return TweetResponse{}, err
+	}
+	// fmt.Println(string(body))
+
+	var response TweetResponse
+	err = json.Unmarshal(body, &response)
+	return response, err
+}
+
+// GetMoreTweetsFromSearch keeps calling Search with the latest cursor, merging the tweets and users
+// of every new response into `response`, until there is no cursor or `response` holds at least
+// `max_results` tweets.  Returns END_OF_FEED if a response has no tweets and repeats the previous cursor.
+func (api API) GetMoreTweetsFromSearch(query string, response *TweetResponse, max_results int) error {
+	last_response := response
+	for last_response.GetCursor() != "" && len(response.GlobalObjects.Tweets) < max_results {
+		fresh_response, err := api.Search(query, last_response.GetCursor())
+		if err != nil {
+			return err
+		}
+		if fresh_response.GetCursor() == last_response.GetCursor() && len(fresh_response.GlobalObjects.Tweets) == 0 {
+			// Empty response, cursor same as previous: end of feed has been reached
+			return END_OF_FEED
+		}
+
+		last_response = &fresh_response
+
+		// Copy the results over
+		for id, tweet := range last_response.GlobalObjects.Tweets {
+			response.GlobalObjects.Tweets[id] = tweet
+		}
+		for id, user := range last_response.GlobalObjects.Users {
+			response.GlobalObjects.Users[id] = user
+		}
+		fmt.Printf("Have %d tweets\n", len(response.GlobalObjects.Tweets))
+		// fmt.Printf("Cursor: %s\n", last_response.GetCursor())
+	}
+	return nil
+}
+
+
 // Add Bearer token and guest token
 func ApiRequestAddTokens(req *http.Request) error {
 	req.Header.Set("Authorization", "Bearer " + BEARER_TOKEN)
diff --git a/scraper/api_types.go b/scraper/api_types.go
index 361596b..42b0810 100644
--- a/scraper/api_types.go
+++ b/scraper/api_types.go
@@ -308,9 +308,25 @@ func (t *TweetResponse) HandleTombstones() []string {
 
+// GetCursor returns the cursor value found in this response, or "" if there is none.
+// It looks at the last AddEntries entry of the first instruction, then at the ReplaceEntry of the last one.
 func (t *TweetResponse) GetCursor() string {
-	entries := t.Timeline.Instructions[0].AddEntries.Entries
-	last_entry := entries[len(entries) - 1]
-	if strings.Contains(last_entry.EntryID, "cursor") {
-		return last_entry.Content.Operation.Cursor.Value
+	instructions := t.Timeline.Instructions
+	if len(instructions) == 0 {
+		// No instructions at all: indexing below would panic, and there is no cursor to find
+		return ""
+	}
+
+	entries := instructions[0].AddEntries.Entries
+	if len(entries) > 0 {
+		last_entry := entries[len(entries) - 1]
+		if strings.Contains(last_entry.EntryID, "cursor") {
+			return last_entry.Content.Operation.Cursor.Value
+		}
+	}
+
+	// Next, try the other format ("replaceEntry")
+	last_replace_entry := instructions[len(instructions) - 1].ReplaceEntry.Entry
+	if strings.Contains(last_replace_entry.EntryID, "cursor") {
+		return last_replace_entry.Content.Operation.Cursor.Value
 	}
 	return ""
 }