Add helper methods to API types to help pre-process them
This commit is contained in:
parent
2c8fe25e78
commit
0738f77b55
@ -1,6 +1,10 @@
|
|||||||
package scraper
|
package scraper
|
||||||
|
|
||||||
import "time"
|
import (
|
||||||
|
"time"
|
||||||
|
"strings"
|
||||||
|
"encoding/json"
|
||||||
|
)
|
||||||
|
|
||||||
type APITweet struct {
|
type APITweet struct {
|
||||||
ID string `json:"id_str"`
|
ID string `json:"id_str"`
|
||||||
@ -24,7 +28,7 @@ type APITweet struct {
|
|||||||
Mentions []struct {
|
Mentions []struct {
|
||||||
UserName string `json:"screen_name"`
|
UserName string `json:"screen_name"`
|
||||||
UserID string `json:"id_str"`
|
UserID string `json:"id_str"`
|
||||||
}
|
} `json:"user_mentions"`
|
||||||
} `json:"entities"`
|
} `json:"entities"`
|
||||||
ExtendedEntities struct {
|
ExtendedEntities struct {
|
||||||
Media []struct {
|
Media []struct {
|
||||||
@ -50,6 +54,37 @@ type APITweet struct {
|
|||||||
UserIDStr string `json:"user_id_str"`
|
UserIDStr string `json:"user_id_str"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *APITweet) NormalizeContent() {
|
||||||
|
// Remove embedded links at the end of the text
|
||||||
|
if len(t.Entities.URLs) == 1 {
|
||||||
|
url := t.Entities.URLs[0].URL
|
||||||
|
if strings.Index(t.FullText, url) == len(t.FullText) - len(url) {
|
||||||
|
t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the newline
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(t.Entities.Media) == 1 {
|
||||||
|
url := t.Entities.Media[0].URL
|
||||||
|
if strings.Index(t.FullText, url) == len(t.FullText) - len(url) {
|
||||||
|
t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the trailing space
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Remove leading `@username` for replies
|
||||||
|
if t.InReplyToScreenName != "" {
|
||||||
|
if strings.Index(t.FullText, "@" + t.InReplyToScreenName) == 0 {
|
||||||
|
t.FullText = t.FullText[len(t.InReplyToScreenName) + 1:] // `@`, username, space
|
||||||
|
}
|
||||||
|
}
|
||||||
|
t.FullText = strings.TrimSpace(t.FullText)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t APITweet) String() string {
|
||||||
|
data, err := json.Marshal(t)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
return string(data)
|
||||||
|
}
|
||||||
|
|
||||||
type TweetResponse struct {
|
type TweetResponse struct {
|
||||||
GlobalObjects struct {
|
GlobalObjects struct {
|
||||||
Tweets map[string]APITweet `json:"tweets"`
|
Tweets map[string]APITweet `json:"tweets"`
|
||||||
@ -79,8 +114,34 @@ type TweetResponse struct {
|
|||||||
Verified bool `json:"verified"`
|
Verified bool `json:"verified"`
|
||||||
} `json:"users"`
|
} `json:"users"`
|
||||||
} `json:"globalObjects"`
|
} `json:"globalObjects"`
|
||||||
|
// Timeline holds the response's timeline instructions. Every entry added by
// an instruction has an entry ID and a cursor value nested under
// content -> operation -> cursor.
Timeline struct {
	Instructions []struct {
		AddEntries struct {
			Entries []struct {
				EntryID string `json:"entryId"`
				Content struct {
					Operation struct {
						Cursor struct {
							Value string `json:"value"`
						} `json:"cursor"`
					} `json:"operation"` // closing quote restored so the tag parses
				} `json:"content"`
			} `json:"entries"`
		} `json:"addEntries"`
	} `json:"instructions"`
} `json:"timeline"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *TweetResponse) GetCursor() string {
|
||||||
|
entries := t.Timeline.Instructions[0].AddEntries.Entries
|
||||||
|
last_entry := entries[len(entries) - 1]
|
||||||
|
if strings.Contains(last_entry.EntryID, "cursor") {
|
||||||
|
return last_entry.Content.Operation.Cursor.Value
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
type UserResponse struct {
|
type UserResponse struct {
|
||||||
Data struct {
|
Data struct {
|
||||||
User struct {
|
User struct {
|
||||||
|
58
scraper/api_types_test.go
Normal file
58
scraper/api_types_test.go
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
package scraper_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"io/ioutil"
|
||||||
|
"encoding/json"
|
||||||
|
|
||||||
|
"offline_twitter/scraper"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
func TestNormalizeContent(t *testing.T) {
|
||||||
|
test_cases := []struct {
|
||||||
|
filename string
|
||||||
|
eventual_full_text string
|
||||||
|
} {
|
||||||
|
{"test_responses/tweet_with_gif_reply.json", ""},
|
||||||
|
{"test_responses/tweet_with_image.json", "this saddens me every time"},
|
||||||
|
{"test_responses/tweet_with_reply.json", "I always liked \"The Anarchist's Cookbook.\""},
|
||||||
|
}
|
||||||
|
for _, v := range test_cases {
|
||||||
|
data, err := ioutil.ReadFile(v.filename)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
var tweet scraper.APITweet
|
||||||
|
err = json.Unmarshal(data, &tweet)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf(err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
tweet.NormalizeContent()
|
||||||
|
|
||||||
|
if tweet.FullText != v.eventual_full_text {
|
||||||
|
t.Errorf("Expected %q, got %q", v.eventual_full_text, tweet.FullText)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
func TestGetCursor(t *testing.T) {
|
||||||
|
data, err := ioutil.ReadFile("test_responses/midriffs_anarchist_cookbook.json")
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
var tweet_resp scraper.TweetResponse
|
||||||
|
err = json.Unmarshal(data, &tweet_resp)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf(err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
expected_cursor := "LBmGhsC+ibH1peAmgICjpbS0m98mgICj7a2lmd8mhsC4rbmsmN8mgMCqkbT1p+AmgsC4ucv4o+AmhoCyrf+nlt8mhMC9qfOwlt8mJQISAAA="
|
||||||
|
actual_cursor := tweet_resp.GetCursor()
|
||||||
|
|
||||||
|
if expected_cursor != actual_cursor {
|
||||||
|
t.Errorf("Expected %q, got %q", expected_cursor, actual_cursor)
|
||||||
|
}
|
||||||
|
}
|
1
scraper/test_responses/tweet_with_gif_reply.json
Normal file
1
scraper/test_responses/tweet_with_gif_reply.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{"id_str":"1395902556707057666","conversation_id_str":"1395882872729477131","created_at":"Sat May 22 00:41:18 +0000 2021","favorite_count":0,"full_text":"@michaelmalice https://t.co/VxuTr4wc5H","entities":{"hashtags":null,"media":[{"media_url_https":"https://pbs.twimg.com/tweet_video_thumb/E189-VhVoAYcrDv.jpg","type":"photo","url":"https://t.co/VxuTr4wc5H"}],"urls":null,"Mentions":null},"extended_entities":{"media":[{"id_str":"1395902550646300678","media_url_https":"https://pbs.twimg.com/tweet_video_thumb/E189-VhVoAYcrDv.jpg","type":"animated_gif","video_info":{"variants":[{"url":"https://video.twimg.com/tweet_video/E189-VhVoAYcrDv.mp4"}]}}]},"in_reply_to_status_id_str":"1395882872729477131","in_reply_to_screen_name":"michaelmalice","reply_count":0,"retweet_count":0,"quote_count":0,"retweeted_status_id_str":"","quoted_status_id_str":"","time":"0001-01-01T00:00:00Z","user_id_str":"1227095561494392832"}
|
1
scraper/test_responses/tweet_with_image.json
Normal file
1
scraper/test_responses/tweet_with_image.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{"created_at":"Fri May 21 23:23:05 +0000 2021","id_str":"1395882872729477131","full_text":"this saddens me every time https:\/\/t.co\/jSkwGsbKWv","display_text_range":[0,26],"entities":{"media":[{"id_str":"1395882862289772553","indices":[27,50],"media_url":"http:\/\/pbs.twimg.com\/media\/E18sEUrWYAk8dBl.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E18sEUrWYAk8dBl.jpg","url":"https:\/\/t.co\/jSkwGsbKWv","display_url":"pic.twitter.com\/jSkwGsbKWv","expanded_url":"https:\/\/twitter.com\/michaelmalice\/status\/1395882872729477131\/photo\/1","type":"photo","original_info":{"width":593,"height":239,"focus_rects":[{"x":0,"y":0,"h":239,"w":427},{"x":14,"y":0,"h":239,"w":239},{"x":28,"y":0,"h":239,"w":210},{"x":73,"y":0,"h":239,"w":120},{"x":0,"y":0,"h":239,"w":593}]},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":593,"h":239,"resize":"fit"},"medium":{"w":593,"h":239,"resize":"fit"},"small":{"w":593,"h":239,"resize":"fit"}},"features":{"large":{"faces":[]},"medium":{"faces":[]},"small":{"faces":[]},"orig":{"faces":[]}}}]},"extended_entities":{"media":[{"id_str":"1395882862289772553","indices":[27,50],"media_url":"http:\/\/pbs.twimg.com\/media\/E18sEUrWYAk8dBl.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E18sEUrWYAk8dBl.jpg","url":"https:\/\/t.co\/jSkwGsbKWv","display_url":"pic.twitter.com\/jSkwGsbKWv","expanded_url":"https:\/\/twitter.com\/michaelmalice\/status\/1395882872729477131\/photo\/1","type":"photo","original_info":{"width":593,"height":239,"focus_rects":[{"x":0,"y":0,"h":239,"w":427},{"x":14,"y":0,"h":239,"w":239},{"x":28,"y":0,"h":239,"w":210},{"x":73,"y":0,"h":239,"w":120},{"x":0,"y":0,"h":239,"w":593}]},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":593,"h":239,"resize":"fit"},"medium":{"w":593,"h":239,"resize":"fit"},"small":{"w":593,"h":239,"resize":"fit"}},"features":{"large":{"faces":[]},"medium":{"faces":[]},"small":{"faces":[]},"orig":{"faces":[]}},"media_key":"3_1395882862289772553","ext_alt_text":nul
l,"ext_media_availability":{"status":"available"},"ext_media_color":{"palette":[{"rgb":{"red":232,"green":245,"blue":254},"percentage":98.18},{"rgb":{"red":112,"green":127,"blue":140},"percentage":1.44},{"rgb":{"red":109,"green":60,"blue":81},"percentage":0.15},{"rgb":{"red":165,"green":169,"blue":164},"percentage":0.15}]},"ext":{"mediaStats":{"r":"Missing","ttl":-1}}}]},"source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eTwitter Web App\u003c\/a\u003e","user_id_str":"44067298","retweet_count":4,"favorite_count":317,"reply_count":27,"quote_count":3,"conversation_id_str":"1395882872729477131","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1395882872729477131"}}
|
1
scraper/test_responses/tweet_with_reply.json
Normal file
1
scraper/test_responses/tweet_with_reply.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{"id_str":"1396194494710788100","conversation_id_str":"1395978577267593218","created_at":"Sat May 22 20:01:22 +0000 2021","favorite_count":1,"full_text":"@michaelmalice I always liked \"The Anarchist's Cookbook.\"","entities":{"hashtags":null,"media":null,"urls":null,"Mentions":null},"extended_entities":{"media":null},"in_reply_to_status_id_str":"1395978577267593218","in_reply_to_screen_name":"michaelmalice","reply_count":2,"retweet_count":0,"quote_count":1,"retweeted_status_id_str":"","quoted_status_id_str":"","time":"0001-01-01T00:00:00Z","user_id_str":"1215108366411931653"}
|
Loading…
x
Reference in New Issue
Block a user