// APICard is the decoding target for a card object in an API response.
// Only the entries of "binding_values" listed below are captured; any other
// keys in the payload are ignored by encoding/json.
type APICard struct {
	BindingValues struct {
		// Domain is the "string_value" of the "domain" binding.
		Domain struct {
			Value string `json:"string_value"`
		} `json:"domain"`

		// Creator is the "user_value" of the "creator" binding. The ID is
		// delivered as a quoted string ("id_str") and decoded into an int64
		// through the ",string" tag option.
		Creator struct {
			UserValue struct {
				Value int64 `json:"id_str,string"`
			} `json:"user_value"`
		} `json:"creator"`

		// Site is the "user_value" of the "site" binding, decoded the same
		// way as Creator.
		Site struct {
			UserValue struct {
				Value int64 `json:"id_str,string"`
			} `json:"user_value"`
		} `json:"site"`

		// Title is the "string_value" of the "title" binding.
		Title struct {
			Value string `json:"string_value"`
		} `json:"title"`

		// Description is the "string_value" of the "description" binding.
		Description struct {
			Value string `json:"string_value"`
		} `json:"description"`

		// Thumbnail is read from the "thumbnail_image_large" binding; only
		// the image URL is kept.
		Thumbnail struct {
			ImageValue struct {
				Url string `json:"url"`
			} `json:"image_value"`
		} `json:"thumbnail_image_large"`
	} `json:"binding_values"`
}
Teachers Union Leader: 'There's No Such Thing As Learning Loss'"},"description":{"type":"STRING","string_value":"\"It’s OK that our babies may not have learned all their times tables,\" says Cecily Myart-Cruz. \"They learned resilience.\""},"thumbnail_image_small":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=144x144","width":144,"height":76}},"thumbnail_image":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=280x150","width":280,"height":147}},"thumbnail_image_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x600","width":600,"height":315}},"thumbnail_image_x_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=png&name=2048x2048_2_exp","width":1200,"height":630}},"thumbnail_image_original":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=orig","width":1200,"height":630}},"summary_photo_image_small":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=386x202","width":386,"height":202}},"summary_photo_image":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x314","width":600,"height":314}},"summary_photo_image_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=800x419","width":800,"height":419}},"summary_photo_image_x_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=png&name=2048x2048_2_exp","width":1200,"height":630}},"summary_photo_image_original":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=orig","width":1200,"he
ight":630}},"photo_image_full_size_small":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=386x202","width":386,"height":202}},"photo_image_full_size":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x314","width":600,"height":314}},"photo_image_full_size_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=800x419","width":800,"height":419}},"photo_image_full_size_x_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=png&name=2048x2048_2_exp","width":1200,"height":630}},"photo_image_full_size_original":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=orig","width":1200,"height":630}},"card_url":{"type":"STRING","string_value":"https://t.co/Y1lWjNEiPK","scribe_key":"card_url"}},"users":{"155581583":{"id_str":"155581583","name":"Robby Soave","screen_name":"robbysoave","location":"","description":"@reason senior editor and \"noted scooter anthropologist.\" Author of \"Panic Attack\" and \"Tech Panic.\" Pre-order now: https://t.co/Mb5iBIZhcP","entities":{"description":{"urls":[{"url":"https://t.co/Mb5iBIZhcP","expanded_url":"http://tinyurl.com/rt2zftny","display_url":"tinyurl.com/rt2zftny","indices":[116,139]}]}},"followers_count":81373,"fast_followers_count":0,"normal_followers_count":81373,"friends_count":1807,"listed_count":979,"created_at":"Mon Jun 14 14:45:39 +0000 
2010","favourites_count":7920,"geo_enabled":true,"verified":true,"statuses_count":12184,"media_count":1139,"profile_image_url_https":"https://pbs.twimg.com/profile_images/1423673056371871744/NKzapFP-_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/155581583/1628264486","profile_image_extensions_alt_text":null,"profile_image_extensions_media_availability":null,"profile_image_extensions_media_color":{"palette":[{"rgb":{"red":188,"green":188,"blue":187},"percentage":54.64},{"rgb":{"red":40,"green":38,"blue":34},"percentage":19.13},{"rgb":{"red":210,"green":152,"blue":114},"percentage":15.07},{"rgb":{"red":105,"green":68,"blue":43},"percentage":8.59},{"rgb":{"red":182,"green":104,"blue":84},"percentage":0.36}]},"profile_image_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_banner_extensions_alt_text":null,"profile_banner_extensions_media_availability":null,"profile_banner_extensions_media_color":{"palette":[{"rgb":{"red":155,"green":28,"blue":40},"percentage":80.27},{"rgb":{"red":188,"green":133,"blue":139},"percentage":9.65},{"rgb":{"red":236,"green":44,"blue":58},"percentage":8.05},{"rgb":{"red":208,"green":175,"blue":178},"percentage":1.14},{"rgb":{"red":31,"green":27,"blue":27},"percentage":0.4}]},"profile_banner_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_link_color":"0084B4","has_extended_profile":true,"pinned_tweet_ids":[1435296810638233602],"pinned_tweet_ids_str":["1435296810638233602"],"has_custom_timelines":true,"advertiser_account_type":"promotable_user","advertiser_account_service_levels":[],"profile_interstitial_type":"","business_profile_state":"none","translator_type":"none","withheld_in_countries":[],"ext":{"highlightedLabel":{"r":{"ok":{}},"ttl":-1}}},"16467567":{"id_str":"16467567","name":"reason","screen_name":"reason","location":"Washington, DC and Los Angeles","description":"Reason is the monthly magazine and website of “free minds and free markets” published by 
@ReasonFdn.","url":"https://t.co/W2vfk5WIWy","entities":{"url":{"urls":[{"url":"https://t.co/W2vfk5WIWy","expanded_url":"http://reason.com","display_url":"reason.com","indices":[0,23]}]},"description":{}},"followers_count":265717,"fast_followers_count":0,"normal_followers_count":265717,"friends_count":365,"listed_count":6456,"created_at":"Fri Sep 26 13:31:17 +0000 2008","favourites_count":208,"verified":true,"statuses_count":91534,"media_count":2538,"profile_image_url_https":"https://pbs.twimg.com/profile_images/943872166101000192/JIdyYi7P_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/16467567/1629580544","profile_image_extensions_alt_text":null,"profile_image_extensions_media_availability":null,"profile_image_extensions_media_color":{"palette":[{"rgb":{"red":242,"green":107,"blue":52},"percentage":92.62},{"rgb":{"red":255,"green":255,"blue":255},"percentage":7.15},{"rgb":{"red":249,"green":193,"blue":168},"percentage":0.15}]},"profile_image_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_banner_extensions_alt_text":null,"profile_banner_extensions_media_availability":null,"profile_banner_extensions_media_color":{"palette":[{"rgb":{"red":229,"green":179,"blue":1},"percentage":46.93},{"rgb":{"red":11,"green":9,"blue":4},"percentage":19.06},{"rgb":{"red":241,"green":222,"blue":87},"percentage":8.48},{"rgb":{"red":144,"green":115,"blue":28},"percentage":8.16},{"rgb":{"red":238,"green":151,"blue":3},"percentage":5.56}]},"profile_banner_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_link_color":"FF6C2F","pinned_tweet_ids":[],"pinned_tweet_ids_str":[],"advertiser_account_type":"promotable_user","advertiser_account_service_levels":["smb","media_studio"],"profile_interstitial_type":"","business_profile_state":"none","translator_type":"none","withheld_in_countries":[],"ext":{"highlightedLabel":{"r":{"ok":{}},"ttl":-1}}}},"card_platform":{"platform":{"device":{"name":"Swift","version":"12"},"audience":{"name":"
// get_thumbnail_local_path derives the local file name for a thumbnail from
// its remote URL.
//
// The name is built from the last element of the URL path plus the "name" and
// "format" query parameters, so
// "https://host/card_img/123/odDi9EqO?format=jpg&name=600x600" becomes
// "odDi9EqO_600x600.jpg".
//
// An empty remote_url yields "". When the "name" or "format" parameter is
// absent or empty, the matching "_<name>" or ".<format>" part is left out
// rather than indexing into an empty slice and panicking.
//
// It panics if remote_url or its query string cannot be parsed, since the
// signature has no error return.
func get_thumbnail_local_path(remote_url string) string {
	// Nothing to derive a file name from.
	if remote_url == "" {
		return ""
	}

	parsed, err := url.Parse(remote_url)
	if err != nil {
		panic(err)
	}
	params, err := url.ParseQuery(parsed.RawQuery)
	if err != nil {
		panic(err)
	}

	// Values.Get returns "" for a missing key, so no slice is indexed blindly.
	local := path.Base(parsed.Path)
	if name := params.Get("name"); name != "" {
		local = fmt.Sprintf("%s_%s", local, name)
	}
	if format := params.Get("format"); format != "" {
		local = fmt.Sprintf("%s.%s", local, format)
	}
	return local
}
Teachers Union Leader: 'There's No Such Thing As Learning Loss'" + if url.Title != expected_title { + t.Errorf("Expected %q, got %q", expected_title, url.Title) + } + expected_description := "\"It’s OK that our babies may not have learned all their times tables,\" says Cecily Myart-Cruz. \"They learned resilience.\"" + if url.Description != expected_description { + t.Errorf("Expected %q, got %q", expected_description, url.Description) + } + expected_remote_url := "https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x600" + if url.ThumbnailRemoteUrl != expected_remote_url { + t.Errorf("Expected %q, got %q", expected_remote_url, url.ThumbnailRemoteUrl) + } + expected_local_filename := "odDi9EqO_600x600.jpg" + if url.ThumbnailLocalPath != expected_local_filename { + t.Errorf("Expected %q, got %q", expected_local_filename, url.ThumbnailLocalPath) + } + expected_creator_id := scraper.UserID(155581583) + if url.CreatorID != expected_creator_id { + t.Errorf("Expected %d, got %d", expected_creator_id, url.CreatorID) + } + expected_site_id := scraper.UserID(16467567) + if url.SiteID != expected_site_id { + t.Errorf("Expected %d, got %d", expected_site_id, url.SiteID) + } + if url.IsContentDownloaded { + t.Errorf("Expected it not to be downloaded, but it was") + } +}