Change method of scraping images

- break `Filename` field on Image into `RemoteURL` and `LocalFilename`
- new parsing function for an APIMedia item
This commit is contained in:
Alessio 2021-08-03 17:34:44 -07:00
parent 2663387f29
commit 494ca25dc4
6 changed files with 62 additions and 7 deletions

View File

@ -3,6 +3,8 @@
set -e set -e
set -x set -x
PS4='+(${BASH_SOURCE}:${LINENO}): '
test -e data && rm -r data test -e data && rm -r data
go run ./twitter create_profile data go run ./twitter create_profile data

View File

@ -14,6 +14,12 @@ func (v SortableVariants) Len() int { return len(v) }
func (v SortableVariants) Swap(i, j int) { v[i], v[j] = v[j], v[i] } func (v SortableVariants) Swap(i, j int) { v[i], v[j] = v[j], v[i] }
func (v SortableVariants) Less(i, j int) bool { return v[i].Bitrate > v[j].Bitrate } func (v SortableVariants) Less(i, j int) bool { return v[i].Bitrate > v[j].Bitrate }
// APIMedia is one media entry as it is decoded from API JSON.
type APIMedia struct {
	// MediaURLHttps is read from the "media_url_https" key.
	MediaURLHttps string `json:"media_url_https"`

	// Type is read from the "type" key.
	Type string `json:"type"`

	// URL is read from the "url" key.
	URL string `json:"url"`
}
type APITweet struct { type APITweet struct {
ID string `json:"id_str"` ID string `json:"id_str"`
ConversationIDStr string `json:"conversation_id_str"` ConversationIDStr string `json:"conversation_id_str"`
@ -24,11 +30,7 @@ type APITweet struct {
Hashtags []struct { Hashtags []struct {
Text string `json:"text"` Text string `json:"text"`
} `json:"hashtags"` } `json:"hashtags"`
Media []struct { Media []APIMedia `json:"media"`
MediaURLHttps string `json:"media_url_https"`
Type string `json:"type"`
URL string `json:"url"`
} `json:"media"`
URLs []struct { URLs []struct {
ExpandedURL string `json:"expanded_url"` ExpandedURL string `json:"expanded_url"`
URL string `json:"url"` URL string `json:"url"`

View File

@ -10,9 +10,21 @@ type Image struct {
ID ImageID ID ImageID
TweetID TweetID TweetID TweetID
Filename string Filename string
RemoteURL string
LocalFilename string
IsDownloaded bool IsDownloaded bool
} }
// ParseAPIMedia builds an Image from a single API media entry.
//
// RemoteURL is set to the entry's MediaURLHttps, and LocalFilename to the last
// slash-separated element of that URL as returned by path.Base. The image is
// marked as not downloaded. Fields not assigned here keep their zero values.
func ParseAPIMedia(apiMedia APIMedia) Image {
	remoteURL := apiMedia.MediaURLHttps

	var img Image
	img.Filename = remoteURL // XXX filename: still mirrors the remote URL
	img.RemoteURL = remoteURL
	img.LocalFilename = path.Base(remoteURL)
	img.IsDownloaded = false
	return img
}
func (img Image) FilenameWhenDownloaded() string { func (img Image) FilenameWhenDownloaded() string {
return path.Base(img.Filename) return path.Base(img.Filename)
} }

37
scraper/image_test.go Normal file
View File

@ -0,0 +1,37 @@
package scraper_test
import (
"testing"
"io/ioutil"
"encoding/json"
"offline_twitter/scraper"
)
// TestParseAPIMedia decodes the recorded media entry in
// test_responses/image.json into a scraper.APIMedia, runs it through
// scraper.ParseAPIMedia, and checks the resulting image's RemoteURL,
// Filename, LocalFilename and IsDownloaded fields.
func TestParseAPIMedia(t *testing.T) {
	data, err := ioutil.ReadFile("test_responses/image.json")
	if err != nil {
		// Fail this test only; a panic here would abort the whole test binary.
		t.Fatalf("reading fixture: %v", err)
	}

	var apimedia scraper.APIMedia
	if err := json.Unmarshal(data, &apimedia); err != nil {
		t.Fatalf("decoding fixture: %v", err)
	}

	image := scraper.ParseAPIMedia(apimedia)

	expectedRemoteURL := "https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg"
	if image.RemoteURL != expectedRemoteURL {
		t.Errorf("RemoteURL: expected %q, got %q", expectedRemoteURL, image.RemoteURL)
	}
	if image.Filename != expectedRemoteURL { // XXX filename: delete this check
		t.Errorf("Filename: expected %q, got %q", expectedRemoteURL, image.Filename)
	}

	expectedLocalFilename := "E18sEUrWYAk8dBl.jpg"
	if image.LocalFilename != expectedLocalFilename {
		t.Errorf("LocalFilename: expected %q, got %q", expectedLocalFilename, image.LocalFilename)
	}

	if image.IsDownloaded {
		t.Errorf("Expected it not to be downloaded, but it was")
	}
}

View File

@ -0,0 +1 @@
{"id_str":"1395882862289772553","indices":[27,50],"media_url":"http://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg","media_url_https":"https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg","url":"https://t.co/jSkwGsbKWv","display_url":"pic.twitter.com/jSkwGsbKWv","expanded_url":"https://twitter.com/michaelmalice/status/1395882872729477131/photo/1","type":"photo","original_info":{"width":593,"height":239,"focus_rects":[{"x":0,"y":0,"h":239,"w":427},{"x":14,"y":0,"h":239,"w":239},{"x":28,"y":0,"h":239,"w":210},{"x":73,"y":0,"h":239,"w":120},{"x":0,"y":0,"h":239,"w":593}]},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":593,"h":239,"resize":"fit"},"medium":{"w":593,"h":239,"resize":"fit"},"small":{"w":593,"h":239,"resize":"fit"}},"features":{"large":{"faces":[]},"medium":{"faces":[]},"small":{"faces":[]},"orig":{"faces":[]}}}

View File

@ -92,11 +92,12 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
ret.Urls = append(ret.Urls, url.ExpandedURL) ret.Urls = append(ret.Urls, url.ExpandedURL)
} }
for _, media := range apiTweet.Entities.Media { for _, media := range apiTweet.Entities.Media {
if media.Type != "photo" { if media.Type != "photo" { // TODO: remove this eventually
panic_str := fmt.Sprintf("Unknown media type: %q", media.Type) panic_str := fmt.Sprintf("Unknown media type: %q", media.Type)
panic(panic_str) panic(panic_str)
} }
new_image := Image{TweetID: ret.ID, Filename: media.MediaURLHttps, IsDownloaded: false} new_image := ParseAPIMedia(media)
new_image.TweetID = ret.ID
ret.Images = append(ret.Images, new_image) ret.Images = append(ret.Images, new_image)
} }
for _, hashtag := range apiTweet.Entities.Hashtags { for _, hashtag := range apiTweet.Entities.Hashtags {