Change method of scraping images

- break `Filename` field on Image into `RemoteURL` and `LocalFilename`
- new parsing function for an APIMedia item
2021-08-03 17:34:44 -07:00 · 2021-08-03 17:34:44 -07:00 · 494ca25dc4
commit 494ca25dc4
parent 2663387f29
6 changed files with 62 additions and 7 deletions
--- a/cmd/tests.sh
+++ b/cmd/tests.sh
@ -3,6 +3,8 @@
 set -e
 set -x

+PS4='+(${BASH_SOURCE}:${LINENO}): '
+
 test -e data && rm -r data

 go run ./twitter create_profile data
--- a/scraper/api_types.go
+++ b/scraper/api_types.go
@ -14,6 +14,12 @@ func (v SortableVariants) Len() int { return len(v) }
 func (v SortableVariants) Swap(i, j int) { v[i], v[j] = v[j], v[i] }
 func (v SortableVariants) Less(i, j int) bool { return v[i].Bitrate > v[j].Bitrate }

+// APIMedia is a single media item as decoded from the API's JSON.
+// It is the element type of the `media` list inside APITweet.
+type APIMedia struct {
+	MediaURLHttps string `json:"media_url_https"`
+	Type          string `json:"type"`
+	URL           string `json:"url"`
+}
+
 type APITweet struct {
 	ID                string `json:"id_str"`
 	ConversationIDStr string `json:"conversation_id_str"`
@ -24,11 +30,7 @@ type APITweet struct {
 		Hashtags []struct {
 			Text string `json:"text"`
 		} `json:"hashtags"`
-		Media []struct {
-			MediaURLHttps string `json:"media_url_https"`
-			Type          string `json:"type"`
-			URL           string `json:"url"`
-		} `json:"media"`
+		Media []APIMedia `json:"media"`
 		URLs []struct {
 			ExpandedURL string `json:"expanded_url"`
 			URL         string `json:"url"`
--- a/scraper/image.go
+++ b/scraper/image.go
@ -10,9 +10,21 @@ type Image struct {
    ID ImageID
    TweetID TweetID
    Filename string
+    RemoteURL string
+    LocalFilename string
    IsDownloaded bool
 }

+// ParseAPIMedia converts an APIMedia item into an Image that is marked as not downloaded.
+//
+// RemoteURL is the item's `media_url_https` value, and LocalFilename is the last
+// path element of that URL. ID and TweetID are not set here and keep their zero values.
+func ParseAPIMedia(apiMedia APIMedia) Image {
+    local_filename := path.Base(apiMedia.MediaURLHttps)
+    return Image{
+        Filename: apiMedia.MediaURLHttps,  // XXX filename: old field, still populated with the full URL
+        RemoteURL: apiMedia.MediaURLHttps,
+        LocalFilename: local_filename,
+        IsDownloaded: false,
+    }
+}
+
 // FilenameWhenDownloaded returns the last path element of img.Filename.
 // NOTE(review): this still reads the old Filename field rather than LocalFilename.
 func (img Image) FilenameWhenDownloaded() string {
    return path.Base(img.Filename)
 }
--- a/scraper/image_test.go
+++ b/scraper/image_test.go
@ -0,0 +1,37 @@
+package scraper_test
+
+import (
+    "testing"
+    "io/ioutil"
+    "encoding/json"
+
+    "offline_twitter/scraper"
+)
+
+// TestParseAPIMedia decodes the fixture test_responses/image.json into a scraper.APIMedia,
+// passes it through scraper.ParseAPIMedia, and checks the four fields the parser sets:
+// RemoteURL, Filename, LocalFilename and IsDownloaded.
+func TestParseAPIMedia(t *testing.T) {
+    data, err := ioutil.ReadFile("test_responses/image.json")
+    if err != nil {
+        // NOTE(review): a missing fixture panics here, while the unmarshal error below uses t.Fatal.
+        panic(err)
+    }
+    var apimedia scraper.APIMedia
+    err = json.Unmarshal(data, &apimedia)
+    if err != nil {
+        t.Fatal(err.Error())
+    }
+    image := scraper.ParseAPIMedia(apimedia)
+
+    // The remote URL must be the fixture's `media_url_https` value, unchanged.
+    expected_remote_url := "https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg"
+    if image.RemoteURL != expected_remote_url {
+        t.Errorf("Expected %q, got %q", expected_remote_url, image.RemoteURL)
+    }
+    if image.Filename != expected_remote_url { // XXX filename: delete this check
+        t.Errorf("Expected %q, got %q", expected_remote_url, image.Filename)
+    }
+    // The local filename is the last path element of the remote URL.
+    expected_local_filename := "E18sEUrWYAk8dBl.jpg"
+    if image.LocalFilename != expected_local_filename {
+        t.Errorf("Expected %q, got %q", expected_local_filename, image.LocalFilename)
+    }
+    if image.IsDownloaded {
+        t.Errorf("Expected it not to be downloaded, but it was")
+    }
+}
--- a/scraper/test_responses/image.json
+++ b/scraper/test_responses/image.json
@ -0,0 +1 @@
+{"id_str":"1395882862289772553","indices":[27,50],"media_url":"http://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg","media_url_https":"https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg","url":"https://t.co/jSkwGsbKWv","display_url":"pic.twitter.com/jSkwGsbKWv","expanded_url":"https://twitter.com/michaelmalice/status/1395882872729477131/photo/1","type":"photo","original_info":{"width":593,"height":239,"focus_rects":[{"x":0,"y":0,"h":239,"w":427},{"x":14,"y":0,"h":239,"w":239},{"x":28,"y":0,"h":239,"w":210},{"x":73,"y":0,"h":239,"w":120},{"x":0,"y":0,"h":239,"w":593}]},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":593,"h":239,"resize":"fit"},"medium":{"w":593,"h":239,"resize":"fit"},"small":{"w":593,"h":239,"resize":"fit"}},"features":{"large":{"faces":[]},"medium":{"faces":[]},"small":{"faces":[]},"orig":{"faces":[]}}}
--- a/scraper/tweet.go
+++ b/scraper/tweet.go
@ -92,11 +92,12 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
 		ret.Urls = append(ret.Urls, url.ExpandedURL)
 	}
 	for _, media := range apiTweet.Entities.Media {
-		if media.Type != "photo" {
+		if media.Type != "photo" {  // TODO: remove this eventually
 			panic_str := fmt.Sprintf("Unknown media type: %q", media.Type)
 			panic(panic_str)
 		}
-		new_image := Image{TweetID: ret.ID, Filename: media.MediaURLHttps, IsDownloaded: false}
+		new_image := ParseAPIMedia(media)
+		new_image.TweetID = ret.ID
 		ret.Images = append(ret.Images, new_image)
 	}
 	for _, hashtag := range apiTweet.Entities.Hashtags {
				`@ -0,0 +1 @@`
				{"id_str":"1395882862289772553","indices":[27,50],"media_url":"http://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg","media_url_https":"https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg","url":"https://t.co/jSkwGsbKWv","display_url":"pic.twitter.com/jSkwGsbKWv","expanded_url":"https://twitter.com/michaelmalice/status/1395882872729477131/photo/1","type":"photo","original_info":{"width":593,"height":239,"focus_rects":[{"x":0,"y":0,"h":239,"w":427},{"x":14,"y":0,"h":239,"w":239},{"x":28,"y":0,"h":239,"w":210},{"x":73,"y":0,"h":239,"w":120},{"x":0,"y":0,"h":239,"w":593}]},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":593,"h":239,"resize":"fit"},"medium":{"w":593,"h":239,"resize":"fit"},"small":{"w":593,"h":239,"resize":"fit"}},"features":{"large":{"faces":[]},"medium":{"faces":[]},"small":{"faces":[]},"orig":{"faces":[]}}}