Add helper methods to API types to help pre-process them

This commit is contained in:
Alessio 2021-05-23 20:58:31 -04:00
parent 2c8fe25e78
commit 0738f77b55
5 changed files with 124 additions and 2 deletions

View File

@ -1,6 +1,10 @@
package scraper package scraper
import "time" import (
"time"
"strings"
"encoding/json"
)
type APITweet struct { type APITweet struct {
ID string `json:"id_str"` ID string `json:"id_str"`
@ -24,7 +28,7 @@ type APITweet struct {
Mentions []struct { Mentions []struct {
UserName string `json:"screen_name"` UserName string `json:"screen_name"`
UserID string `json:"id_str"` UserID string `json:"id_str"`
} } `json:"user_mentions"`
} `json:"entities"` } `json:"entities"`
ExtendedEntities struct { ExtendedEntities struct {
Media []struct { Media []struct {
@ -50,6 +54,37 @@ type APITweet struct {
UserIDStr string `json:"user_id_str"` UserIDStr string `json:"user_id_str"`
} }
// NormalizeContent rewrites t.FullText in place, removing pieces that are
// already represented elsewhere on the tweet.
//
// In order, it:
//   - removes the URL of the only entry in Entities.URLs when FullText ends with it;
//   - removes the URL of the only entry in Entities.Media when FullText ends with it;
//   - removes a leading "@<InReplyToScreenName>" mention when the tweet is a reply;
//   - trims leading and trailing whitespace.
//
// Tweets with zero or several URL/media entities keep their links untouched.
func (t *APITweet) NormalizeContent() {
	// Remove embedded links at the end of the text. TrimSuffix is a no-op when
	// the text does not end with the URL, so a URL longer than the text (or one
	// that also occurs earlier in the text) is handled without manual index
	// arithmetic. The whitespace left in front of the link is removed by the
	// final TrimSpace.
	if len(t.Entities.URLs) == 1 {
		t.FullText = strings.TrimSuffix(t.FullText, t.Entities.URLs[0].URL)
	}
	if len(t.Entities.Media) == 1 {
		t.FullText = strings.TrimSuffix(t.FullText, t.Entities.Media[0].URL)
	}

	// Remove leading `@username` for replies, but only when it is a complete
	// mention: "@bobby" must not be cut down to "by" for a reply to "bob".
	if t.InReplyToScreenName != "" {
		mention := "@" + t.InReplyToScreenName
		if strings.HasPrefix(t.FullText, mention) {
			rest := t.FullText[len(mention):]
			if !startsWithScreenNameChar(rest) {
				t.FullText = rest
			}
		}
	}

	t.FullText = strings.TrimSpace(t.FullText)
}

// startsWithScreenNameChar reports whether s begins with an ASCII letter, digit
// or underscore, i.e. whether a mention placed directly before s would continue
// into s instead of ending there.
func startsWithScreenNameChar(s string) bool {
	if s == "" {
		return false
	}
	c := s[0]
	return c == '_' ||
		('0' <= c && c <= '9') ||
		('a' <= c && c <= 'z') ||
		('A' <= c && c <= 'Z')
}
// String returns the tweet encoded as JSON.
//
// It panics if json.Marshal reports an error.
func (t APITweet) String() string {
	encoded, err := json.Marshal(t)
	if err == nil {
		return string(encoded)
	}
	panic(err)
}
type TweetResponse struct { type TweetResponse struct {
GlobalObjects struct { GlobalObjects struct {
Tweets map[string]APITweet `json:"tweets"` Tweets map[string]APITweet `json:"tweets"`
@ -79,8 +114,34 @@ type TweetResponse struct {
Verified bool `json:"verified"` Verified bool `json:"verified"`
} `json:"users"` } `json:"users"`
} `json:"globalObjects"` } `json:"globalObjects"`
// Timeline mirrors the "timeline" object of the response. Only the path
// instructions -> addEntries -> entries -> content.operation.cursor.value is
// modelled; GetCursor reads the cursor value through it.
Timeline struct {
	Instructions []struct {
		AddEntries struct {
			Entries []struct {
				EntryID string `json:"entryId"`
				Content struct {
					Operation struct {
						Cursor struct {
							Value string `json:"value"`
						} `json:"cursor"`
						// The closing quote was missing from this tag, which made it
						// malformed: the json package then ignores it and falls back
						// to the Go field name.
					} `json:"operation"`
				} `json:"content"`
			} `json:"entries"`
		} `json:"addEntries"`
	} `json:"instructions"`
} `json:"timeline"`
} }
// GetCursor returns the cursor value stored on the last entry of the first
// timeline instruction.
//
// It returns "" when the response has no instructions, when the first
// instruction has no entries, or when the last entry's EntryID does not
// contain "cursor".
func (t *TweetResponse) GetCursor() string {
	instructions := t.Timeline.Instructions
	if len(instructions) == 0 {
		return ""
	}

	entries := instructions[0].AddEntries.Entries
	if len(entries) == 0 {
		return ""
	}

	lastEntry := entries[len(entries)-1]
	if !strings.Contains(lastEntry.EntryID, "cursor") {
		return ""
	}
	return lastEntry.Content.Operation.Cursor.Value
}
type UserResponse struct { type UserResponse struct {
Data struct { Data struct {
User struct { User struct {

58
scraper/api_types_test.go Normal file
View File

@ -0,0 +1,58 @@
package scraper_test
import (
"testing"
"io/ioutil"
"encoding/json"
"offline_twitter/scraper"
)
// TestNormalizeContent decodes each fixture file into an APITweet, runs
// NormalizeContent on it and compares the resulting FullText with the
// expected text.
func TestNormalizeContent(t *testing.T) {
	testCases := []struct {
		filename         string
		eventualFullText string
	}{
		{"test_responses/tweet_with_gif_reply.json", ""},
		{"test_responses/tweet_with_image.json", "this saddens me every time"},
		{"test_responses/tweet_with_reply.json", "I always liked \"The Anarchist's Cookbook.\""},
	}
	for _, tc := range testCases {
		data, err := ioutil.ReadFile(tc.filename)
		if err != nil {
			// A missing fixture is reported as a test failure rather than a panic.
			t.Fatalf("reading %s: %v", tc.filename, err)
		}

		var tweet scraper.APITweet
		if err := json.Unmarshal(data, &tweet); err != nil {
			// Explicit format string: the error text must not be interpreted
			// as a format. Skip the comparison for a tweet that did not parse.
			t.Errorf("parsing %s: %v", tc.filename, err)
			continue
		}

		tweet.NormalizeContent()

		if tweet.FullText != tc.eventualFullText {
			t.Errorf("Expected %q, got %q", tc.eventualFullText, tweet.FullText)
		}
	}
}
// TestGetCursor decodes a recorded TweetResponse fixture and checks that
// GetCursor returns the cursor value expected for it.
func TestGetCursor(t *testing.T) {
	const fixture = "test_responses/midriffs_anarchist_cookbook.json"

	data, err := ioutil.ReadFile(fixture)
	if err != nil {
		// A missing fixture is reported as a test failure rather than a panic.
		t.Fatalf("reading %s: %v", fixture, err)
	}

	var tweetResp scraper.TweetResponse
	if err := json.Unmarshal(data, &tweetResp); err != nil {
		// Stop here: calling GetCursor on a response that failed to parse
		// would only add a second, misleading failure.
		t.Fatalf("parsing %s: %v", fixture, err)
	}

	expectedCursor := "LBmGhsC+ibH1peAmgICjpbS0m98mgICj7a2lmd8mhsC4rbmsmN8mgMCqkbT1p+AmgsC4ucv4o+AmhoCyrf+nlt8mhMC9qfOwlt8mJQISAAA="
	actualCursor := tweetResp.GetCursor()

	if expectedCursor != actualCursor {
		t.Errorf("Expected %q, got %q", expectedCursor, actualCursor)
	}
}

View File

@ -0,0 +1 @@
{"id_str":"1395902556707057666","conversation_id_str":"1395882872729477131","created_at":"Sat May 22 00:41:18 +0000 2021","favorite_count":0,"full_text":"@michaelmalice https://t.co/VxuTr4wc5H","entities":{"hashtags":null,"media":[{"media_url_https":"https://pbs.twimg.com/tweet_video_thumb/E189-VhVoAYcrDv.jpg","type":"photo","url":"https://t.co/VxuTr4wc5H"}],"urls":null,"Mentions":null},"extended_entities":{"media":[{"id_str":"1395902550646300678","media_url_https":"https://pbs.twimg.com/tweet_video_thumb/E189-VhVoAYcrDv.jpg","type":"animated_gif","video_info":{"variants":[{"url":"https://video.twimg.com/tweet_video/E189-VhVoAYcrDv.mp4"}]}}]},"in_reply_to_status_id_str":"1395882872729477131","in_reply_to_screen_name":"michaelmalice","reply_count":0,"retweet_count":0,"quote_count":0,"retweeted_status_id_str":"","quoted_status_id_str":"","time":"0001-01-01T00:00:00Z","user_id_str":"1227095561494392832"}

View File

@ -0,0 +1 @@
{"created_at":"Fri May 21 23:23:05 +0000 2021","id_str":"1395882872729477131","full_text":"this saddens me every time https:\/\/t.co\/jSkwGsbKWv","display_text_range":[0,26],"entities":{"media":[{"id_str":"1395882862289772553","indices":[27,50],"media_url":"http:\/\/pbs.twimg.com\/media\/E18sEUrWYAk8dBl.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E18sEUrWYAk8dBl.jpg","url":"https:\/\/t.co\/jSkwGsbKWv","display_url":"pic.twitter.com\/jSkwGsbKWv","expanded_url":"https:\/\/twitter.com\/michaelmalice\/status\/1395882872729477131\/photo\/1","type":"photo","original_info":{"width":593,"height":239,"focus_rects":[{"x":0,"y":0,"h":239,"w":427},{"x":14,"y":0,"h":239,"w":239},{"x":28,"y":0,"h":239,"w":210},{"x":73,"y":0,"h":239,"w":120},{"x":0,"y":0,"h":239,"w":593}]},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":593,"h":239,"resize":"fit"},"medium":{"w":593,"h":239,"resize":"fit"},"small":{"w":593,"h":239,"resize":"fit"}},"features":{"large":{"faces":[]},"medium":{"faces":[]},"small":{"faces":[]},"orig":{"faces":[]}}}]},"extended_entities":{"media":[{"id_str":"1395882862289772553","indices":[27,50],"media_url":"http:\/\/pbs.twimg.com\/media\/E18sEUrWYAk8dBl.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E18sEUrWYAk8dBl.jpg","url":"https:\/\/t.co\/jSkwGsbKWv","display_url":"pic.twitter.com\/jSkwGsbKWv","expanded_url":"https:\/\/twitter.com\/michaelmalice\/status\/1395882872729477131\/photo\/1","type":"photo","original_info":{"width":593,"height":239,"focus_rects":[{"x":0,"y":0,"h":239,"w":427},{"x":14,"y":0,"h":239,"w":239},{"x":28,"y":0,"h":239,"w":210},{"x":73,"y":0,"h":239,"w":120},{"x":0,"y":0,"h":239,"w":593}]},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":593,"h":239,"resize":"fit"},"medium":{"w":593,"h":239,"resize":"fit"},"small":{"w":593,"h":239,"resize":"fit"}},"features":{"large":{"faces":[]},"medium":{"faces":[]},"small":{"faces":[]},"orig":{"faces":[]}},"media_key":"3_1395882862289772553","ext_alt_text":nul
l,"ext_media_availability":{"status":"available"},"ext_media_color":{"palette":[{"rgb":{"red":232,"green":245,"blue":254},"percentage":98.18},{"rgb":{"red":112,"green":127,"blue":140},"percentage":1.44},{"rgb":{"red":109,"green":60,"blue":81},"percentage":0.15},{"rgb":{"red":165,"green":169,"blue":164},"percentage":0.15}]},"ext":{"mediaStats":{"r":"Missing","ttl":-1}}}]},"source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eTwitter Web App\u003c\/a\u003e","user_id_str":"44067298","retweet_count":4,"favorite_count":317,"reply_count":27,"quote_count":3,"conversation_id_str":"1395882872729477131","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1395882872729477131"}}

View File

@ -0,0 +1 @@
{"id_str":"1396194494710788100","conversation_id_str":"1395978577267593218","created_at":"Sat May 22 20:01:22 +0000 2021","favorite_count":1,"full_text":"@michaelmalice I always liked \"The Anarchist's Cookbook.\"","entities":{"hashtags":null,"media":null,"urls":null,"Mentions":null},"extended_entities":{"media":null},"in_reply_to_status_id_str":"1395978577267593218","in_reply_to_screen_name":"michaelmalice","reply_count":2,"retweet_count":0,"quote_count":1,"retweeted_status_id_str":"","quoted_status_id_str":"","time":"0001-01-01T00:00:00Z","user_id_str":"1215108366411931653"}