diff --git a/scraper/api_types.go b/scraper/api_types.go
index e4be34b..28a5c4a 100644
--- a/scraper/api_types.go
+++ b/scraper/api_types.go
@@ -1,6 +1,10 @@
 package scraper
 
-import "time"
+import (
+	"encoding/json"
+	"strings"
+	"time"
+)
 
 type APITweet struct {
 	ID string `json:"id_str"`
@@ -24,7 +28,7 @@ type APITweet struct {
 		Mentions []struct {
 			UserName string `json:"screen_name"`
 			UserID string `json:"id_str"`
-		}
+		} `json:"user_mentions"`
 	} `json:"entities"`
 	ExtendedEntities struct {
 		Media []struct {
@@ -50,6 +54,37 @@ type APITweet struct {
 	UserIDStr string `json:"user_id_str"`
 }
 
+// NormalizeContent cleans up FullText in place. It strips a trailing link when
+// the tweet has exactly one URL entity, a trailing media link when it has
+// exactly one media entity, the leading `@username` of a reply, and finally any
+// surrounding whitespace.
+func (t *APITweet) NormalizeContent() {
+	// Remove embedded links at the end of the text. TrimSuffix only acts when
+	// the link really is the suffix, so a text shorter than the link or one that
+	// repeats the link earlier on is handled without index arithmetic.
+	if len(t.Entities.URLs) == 1 {
+		t.FullText = strings.TrimSuffix(t.FullText, t.Entities.URLs[0].URL)
+	}
+	if len(t.Entities.Media) == 1 {
+		t.FullText = strings.TrimSuffix(t.FullText, t.Entities.Media[0].URL)
+	}
+	// Remove leading `@username` for replies
+	if t.InReplyToScreenName != "" {
+		t.FullText = strings.TrimPrefix(t.FullText, "@"+t.InReplyToScreenName)
+	}
+	// Drop the newline or space left behind by the removals above.
+	t.FullText = strings.TrimSpace(t.FullText)
+}
+
+// String returns the tweet encoded as JSON. It panics if encoding fails.
+func (t APITweet) String() string {
+	data, err := json.Marshal(t)
+	if err != nil {
+		panic(err)
+	}
+	return string(data)
+}
+
 type TweetResponse struct {
 	GlobalObjects struct {
 		Tweets map[string]APITweet `json:"tweets"`
@@ -79,8 +114,42 @@ type TweetResponse struct {
 			Verified bool `json:"verified"`
 		} `json:"users"`
 	} `json:"globalObjects"`
+	Timeline struct {
+		Instructions []struct {
+			AddEntries struct {
+				Entries []struct {
+					EntryID string `json:"entryId"`
+					Content struct {
+						Operation struct {
+							Cursor struct {
+								Value string `json:"value"`
+							} `json:"cursor"`
+						} `json:"operation"`
+					} `json:"content"`
+				} `json:"entries"`
+			} `json:"addEntries"`
+		} `json:"instructions"`
+	} `json:"timeline"`
 }
 
+// GetCursor returns the cursor value carried by the last entry of the first
+// timeline instruction. It returns "" when the response has no instructions,
+// the first instruction has no entries, or the last entry is not a cursor entry.
+func (t *TweetResponse) GetCursor() string {
+	if len(t.Timeline.Instructions) == 0 {
+		return ""
+	}
+	entries := t.Timeline.Instructions[0].AddEntries.Entries
+	if len(entries) == 0 {
+		return ""
+	}
+	lastEntry := entries[len(entries)-1]
+	if strings.Contains(lastEntry.EntryID, "cursor") {
+		return lastEntry.Content.Operation.Cursor.Value
+	}
+	return ""
+}
+
 type UserResponse struct {
 	Data struct {
 		User struct {
diff --git a/scraper/api_types_test.go b/scraper/api_types_test.go
new file mode 100644
index 0000000..c3bdbd2
--- /dev/null
+++ b/scraper/api_types_test.go
@@ -0,0 +1,59 @@
+package scraper_test
+
+import (
+	"encoding/json"
+	"io/ioutil"
+	"testing"
+
+	"offline_twitter/scraper"
+)
+
+// TestNormalizeContent runs NormalizeContent over each fixture tweet and
+// compares the resulting FullText with the expected cleaned-up text.
+func TestNormalizeContent(t *testing.T) {
+	testCases := []struct {
+		filename         string
+		eventualFullText string
+	}{
+		{"test_responses/tweet_with_gif_reply.json", ""},
+		{"test_responses/tweet_with_image.json", "this saddens me every time"},
+		{"test_responses/tweet_with_reply.json", "I always liked \"The Anarchist's Cookbook.\""},
+	}
+	for _, v := range testCases {
+		data, err := ioutil.ReadFile(v.filename)
+		if err != nil {
+			t.Fatalf("reading %s: %v", v.filename, err)
+		}
+		var tweet scraper.APITweet
+		// Stop here on a decode failure: the checks below would only report noise.
+		if err := json.Unmarshal(data, &tweet); err != nil {
+			t.Fatalf("decoding %s: %v", v.filename, err)
+		}
+
+		tweet.NormalizeContent()
+
+		if tweet.FullText != v.eventualFullText {
+			t.Errorf("%s: expected %q, got %q", v.filename, v.eventualFullText, tweet.FullText)
+		}
+	}
+}
+
+// TestGetCursor checks the cursor extracted from a recorded timeline response.
+func TestGetCursor(t *testing.T) {
+	const filename = "test_responses/midriffs_anarchist_cookbook.json"
+	data, err := ioutil.ReadFile(filename)
+	if err != nil {
+		t.Fatalf("reading %s: %v", filename, err)
+	}
+	var tweetResp scraper.TweetResponse
+	if err := json.Unmarshal(data, &tweetResp); err != nil {
+		t.Fatalf("decoding %s: %v", filename, err)
+	}
+
+	expectedCursor := "LBmGhsC+ibH1peAmgICjpbS0m98mgICj7a2lmd8mhsC4rbmsmN8mgMCqkbT1p+AmgsC4ucv4o+AmhoCyrf+nlt8mhMC9qfOwlt8mJQISAAA="
+	actualCursor := tweetResp.GetCursor()
+
+	if expectedCursor != actualCursor {
+		t.Errorf("Expected %q, got %q", expectedCursor, actualCursor)
+	}
+}
diff --git a/scraper/test_responses/tweet_with_gif_reply.json b/scraper/test_responses/tweet_with_gif_reply.json
new file mode 100644
index 0000000..6b2f7b2
--- /dev/null
+++ b/scraper/test_responses/tweet_with_gif_reply.json
@@ -0,0 +1 @@
+{"id_str":"1395902556707057666","conversation_id_str":"1395882872729477131","created_at":"Sat May 22 00:41:18 +0000 2021","favorite_count":0,"full_text":"@michaelmalice https://t.co/VxuTr4wc5H","entities":{"hashtags":null,"media":[{"media_url_https":"https://pbs.twimg.com/tweet_video_thumb/E189-VhVoAYcrDv.jpg","type":"photo","url":"https://t.co/VxuTr4wc5H"}],"urls":null,"Mentions":null},"extended_entities":{"media":[{"id_str":"1395902550646300678","media_url_https":"https://pbs.twimg.com/tweet_video_thumb/E189-VhVoAYcrDv.jpg","type":"animated_gif","video_info":{"variants":[{"url":"https://video.twimg.com/tweet_video/E189-VhVoAYcrDv.mp4"}]}}]},"in_reply_to_status_id_str":"1395882872729477131","in_reply_to_screen_name":"michaelmalice","reply_count":0,"retweet_count":0,"quote_count":0,"retweeted_status_id_str":"","quoted_status_id_str":"","time":"0001-01-01T00:00:00Z","user_id_str":"1227095561494392832"}
diff --git a/scraper/test_responses/tweet_with_image.json b/scraper/test_responses/tweet_with_image.json
new file mode 100644
index 0000000..6e74f01
--- /dev/null
+++ b/scraper/test_responses/tweet_with_image.json
@@ -0,0 +1 @@
+{"created_at":"Fri May 21 23:23:05 +0000 2021","id_str":"1395882872729477131","full_text":"this saddens me every time 
https:\/\/t.co\/jSkwGsbKWv","display_text_range":[0,26],"entities":{"media":[{"id_str":"1395882862289772553","indices":[27,50],"media_url":"http:\/\/pbs.twimg.com\/media\/E18sEUrWYAk8dBl.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E18sEUrWYAk8dBl.jpg","url":"https:\/\/t.co\/jSkwGsbKWv","display_url":"pic.twitter.com\/jSkwGsbKWv","expanded_url":"https:\/\/twitter.com\/michaelmalice\/status\/1395882872729477131\/photo\/1","type":"photo","original_info":{"width":593,"height":239,"focus_rects":[{"x":0,"y":0,"h":239,"w":427},{"x":14,"y":0,"h":239,"w":239},{"x":28,"y":0,"h":239,"w":210},{"x":73,"y":0,"h":239,"w":120},{"x":0,"y":0,"h":239,"w":593}]},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":593,"h":239,"resize":"fit"},"medium":{"w":593,"h":239,"resize":"fit"},"small":{"w":593,"h":239,"resize":"fit"}},"features":{"large":{"faces":[]},"medium":{"faces":[]},"small":{"faces":[]},"orig":{"faces":[]}}}]},"extended_entities":{"media":[{"id_str":"1395882862289772553","indices":[27,50],"media_url":"http:\/\/pbs.twimg.com\/media\/E18sEUrWYAk8dBl.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E18sEUrWYAk8dBl.jpg","url":"https:\/\/t.co\/jSkwGsbKWv","display_url":"pic.twitter.com\/jSkwGsbKWv","expanded_url":"https:\/\/twitter.com\/michaelmalice\/status\/1395882872729477131\/photo\/1","type":"photo","original_info":{"width":593,"height":239,"focus_rects":[{"x":0,"y":0,"h":239,"w":427},{"x":14,"y":0,"h":239,"w":239},{"x":28,"y":0,"h":239,"w":210},{"x":73,"y":0,"h":239,"w":120},{"x":0,"y":0,"h":239,"w":593}]},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":593,"h":239,"resize":"fit"},"medium":{"w":593,"h":239,"resize":"fit"},"small":{"w":593,"h":239,"resize":"fit"}},"features":{"large":{"faces":[]},"medium":{"faces":[]},"small":{"faces":[]},"orig":{"faces":[]}},"media_key":"3_1395882862289772553","ext_alt_text":null,"ext_media_availability":{"status":"available"},"ext_media_color":{"palette":[{"rgb":{"red":232,"green":245,"blue":2
54},"percentage":98.18},{"rgb":{"red":112,"green":127,"blue":140},"percentage":1.44},{"rgb":{"red":109,"green":60,"blue":81},"percentage":0.15},{"rgb":{"red":165,"green":169,"blue":164},"percentage":0.15}]},"ext":{"mediaStats":{"r":"Missing","ttl":-1}}}]},"source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eTwitter Web App\u003c\/a\u003e","user_id_str":"44067298","retweet_count":4,"favorite_count":317,"reply_count":27,"quote_count":3,"conversation_id_str":"1395882872729477131","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1395882872729477131"}} diff --git a/scraper/test_responses/tweet_with_reply.json b/scraper/test_responses/tweet_with_reply.json new file mode 100644 index 0000000..fde4dad --- /dev/null +++ b/scraper/test_responses/tweet_with_reply.json @@ -0,0 +1 @@ +{"id_str":"1396194494710788100","conversation_id_str":"1395978577267593218","created_at":"Sat May 22 20:01:22 +0000 2021","favorite_count":1,"full_text":"@michaelmalice I always liked \"The Anarchist's Cookbook.\"","entities":{"hashtags":null,"media":null,"urls":null,"Mentions":null},"extended_entities":{"media":null},"in_reply_to_status_id_str":"1395978577267593218","in_reply_to_screen_name":"michaelmalice","reply_count":2,"retweet_count":0,"quote_count":1,"retweeted_status_id_str":"","quoted_status_id_str":"","time":"0001-01-01T00:00:00Z","user_id_str":"1215108366411931653"}