From 494ca25dc4a90fd11e7e092b74c0e6b6bdebe868 Mon Sep 17 00:00:00 2001 From: Alessio Date: Tue, 3 Aug 2021 17:34:44 -0700 Subject: [PATCH] Change method of scraping images - break `Filename` field on Image into `RemoteURL` and `LocalFilename` - new parsing function for an APIMedia item --- cmd/tests.sh | 2 ++ scraper/api_types.go | 12 +++++----- scraper/image.go | 12 ++++++++++ scraper/image_test.go | 37 +++++++++++++++++++++++++++++++ scraper/test_responses/image.json | 1 + scraper/tweet.go | 5 +++-- 6 files changed, 62 insertions(+), 7 deletions(-) create mode 100644 scraper/image_test.go create mode 100644 scraper/test_responses/image.json diff --git a/cmd/tests.sh b/cmd/tests.sh index 14eb071..a2400f4 100755 --- a/cmd/tests.sh +++ b/cmd/tests.sh @@ -3,6 +3,8 @@ set -e set -x +PS4='+(${BASH_SOURCE}:${LINENO}): ' + test -e data && rm -r data go run ./twitter create_profile data diff --git a/scraper/api_types.go b/scraper/api_types.go index fe8994d..47d3b78 100644 --- a/scraper/api_types.go +++ b/scraper/api_types.go @@ -14,6 +14,12 @@ func (v SortableVariants) Len() int { return len(v) } func (v SortableVariants) Swap(i, j int) { v[i], v[j] = v[j], v[i] } func (v SortableVariants) Less(i, j int) bool { return v[i].Bitrate > v[j].Bitrate } +type APIMedia struct { + MediaURLHttps string `json:"media_url_https"` + Type string `json:"type"` + URL string `json:"url"` +} + type APITweet struct { ID string `json:"id_str"` ConversationIDStr string `json:"conversation_id_str"` @@ -24,11 +30,7 @@ type APITweet struct { Hashtags []struct { Text string `json:"text"` } `json:"hashtags"` - Media []struct { - MediaURLHttps string `json:"media_url_https"` - Type string `json:"type"` - URL string `json:"url"` - } `json:"media"` + Media []APIMedia `json:"media"` URLs []struct { ExpandedURL string `json:"expanded_url"` URL string `json:"url"` diff --git a/scraper/image.go b/scraper/image.go index 3427207..1a104f6 100644 --- a/scraper/image.go +++ b/scraper/image.go @@ -10,9 +10,21 @@ type Image struct { ID ImageID TweetID TweetID Filename string + RemoteURL string + LocalFilename string IsDownloaded bool } +func ParseAPIMedia(apiMedia APIMedia) Image { + local_filename := path.Base(apiMedia.MediaURLHttps) + return Image{ + Filename: apiMedia.MediaURLHttps, // XXX filename + RemoteURL: apiMedia.MediaURLHttps, + LocalFilename: local_filename, + IsDownloaded: false, + } +} + func (img Image) FilenameWhenDownloaded() string { return path.Base(img.Filename) } diff --git a/scraper/image_test.go b/scraper/image_test.go new file mode 100644 index 0000000..00deec3 --- /dev/null +++ b/scraper/image_test.go @@ -0,0 +1,37 @@ +package scraper_test + +import ( + "testing" + "io/ioutil" + "encoding/json" + + "offline_twitter/scraper" +) + +func TestParseAPIMedia(t *testing.T) { + data, err := ioutil.ReadFile("test_responses/image.json") + if err != nil { + panic(err) + } + var apimedia scraper.APIMedia + err = json.Unmarshal(data, &apimedia) + if err != nil { + t.Fatal(err.Error()) + } + image := scraper.ParseAPIMedia(apimedia) + + expected_remote_url := "https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg" + if image.RemoteURL != expected_remote_url { + t.Errorf("Expected %q, got %q", expected_remote_url, image.RemoteURL) + } + if image.Filename != expected_remote_url { // XXX filename: delete this check + t.Errorf("Expected %q, got %q", expected_remote_url, image.Filename) + } + expected_local_filename := "E18sEUrWYAk8dBl.jpg" + if image.LocalFilename != expected_local_filename { + t.Errorf("Expected %q, got %q", expected_local_filename, image.LocalFilename) + } + if image.IsDownloaded { + t.Errorf("Expected it not to be downloaded, but it was") + } +} diff --git a/scraper/test_responses/image.json b/scraper/test_responses/image.json new file mode 100644 index 0000000..075e0d8 --- /dev/null +++ b/scraper/test_responses/image.json @@ -0,0 +1 @@ +{"id_str":"1395882862289772553","indices":[27,50],"media_url":"http://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg","media_url_https":"https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg","url":"https://t.co/jSkwGsbKWv","display_url":"pic.twitter.com/jSkwGsbKWv","expanded_url":"https://twitter.com/michaelmalice/status/1395882872729477131/photo/1","type":"photo","original_info":{"width":593,"height":239,"focus_rects":[{"x":0,"y":0,"h":239,"w":427},{"x":14,"y":0,"h":239,"w":239},{"x":28,"y":0,"h":239,"w":210},{"x":73,"y":0,"h":239,"w":120},{"x":0,"y":0,"h":239,"w":593}]},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":593,"h":239,"resize":"fit"},"medium":{"w":593,"h":239,"resize":"fit"},"small":{"w":593,"h":239,"resize":"fit"}},"features":{"large":{"faces":[]},"medium":{"faces":[]},"small":{"faces":[]},"orig":{"faces":[]}}} diff --git a/scraper/tweet.go b/scraper/tweet.go index 29801a1..ffe92e7 100644 --- a/scraper/tweet.go +++ b/scraper/tweet.go @@ -92,11 +92,12 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) { ret.Urls = append(ret.Urls, url.ExpandedURL) } for _, media := range apiTweet.Entities.Media { - if media.Type != "photo" { + if media.Type != "photo" { // TODO: remove this eventually panic_str := fmt.Sprintf("Unknown media type: %q", media.Type) panic(panic_str) } - new_image := Image{TweetID: ret.ID, Filename: media.MediaURLHttps, IsDownloaded: false} + new_image := ParseAPIMedia(media) + new_image.TweetID = ret.ID ret.Images = append(ret.Images, new_image) } for _, hashtag := range apiTweet.Entities.Hashtags {