Change method of scraping images
- break `Filename` field on Image into `RemoteURL` and `LocalFilename`
- new parsing function for an APIMedia item
This commit is contained in:
parent
2663387f29
commit
494ca25dc4
@ -3,6 +3,8 @@
|
||||
set -e
|
||||
set -x
|
||||
|
||||
PS4='+(${BASH_SOURCE}:${LINENO}): '
|
||||
|
||||
test -e data && rm -r data
|
||||
|
||||
go run ./twitter create_profile data
|
||||
|
@ -14,6 +14,12 @@ func (v SortableVariants) Len() int { return len(v) }
|
||||
// Swap exchanges the variants stored at indexes i and j.
func (v SortableVariants) Swap(i, j int) {
	tmp := v[i]
	v[i] = v[j]
	v[j] = tmp
}
|
||||
// Less reports whether the variant at index i has a strictly higher Bitrate
// than the variant at index j, i.e. higher bitrates order first.
func (v SortableVariants) Less(i, j int) bool {
	return v[j].Bitrate < v[i].Bitrate
}
|
||||
|
||||
// APIMedia is one media entry as it appears in the API's JSON output.
//
// MediaURLHttps is decoded from the "media_url_https" key, Type from "type"
// (for example "photo") and URL from "url". All other keys of the JSON object
// are ignored when decoding.
type APIMedia struct {
	MediaURLHttps string `json:"media_url_https"`
	Type          string `json:"type"`
	URL           string `json:"url"`
}
|
||||
|
||||
type APITweet struct {
|
||||
ID string `json:"id_str"`
|
||||
ConversationIDStr string `json:"conversation_id_str"`
|
||||
@ -24,11 +30,7 @@ type APITweet struct {
|
||||
Hashtags []struct {
|
||||
Text string `json:"text"`
|
||||
} `json:"hashtags"`
|
||||
Media []struct {
|
||||
MediaURLHttps string `json:"media_url_https"`
|
||||
Type string `json:"type"`
|
||||
URL string `json:"url"`
|
||||
} `json:"media"`
|
||||
Media []APIMedia `json:"media"`
|
||||
URLs []struct {
|
||||
ExpandedURL string `json:"expanded_url"`
|
||||
URL string `json:"url"`
|
||||
|
@ -10,9 +10,21 @@ type Image struct {
|
||||
ID ImageID
|
||||
TweetID TweetID
|
||||
Filename string
|
||||
RemoteURL string
|
||||
LocalFilename string
|
||||
IsDownloaded bool
|
||||
}
|
||||
|
||||
func ParseAPIMedia(apiMedia APIMedia) Image {
|
||||
local_filename := path.Base(apiMedia.MediaURLHttps)
|
||||
return Image{
|
||||
Filename: apiMedia.MediaURLHttps, // XXX filename
|
||||
RemoteURL: apiMedia.MediaURLHttps,
|
||||
LocalFilename: local_filename,
|
||||
IsDownloaded: false,
|
||||
}
|
||||
}
|
||||
|
||||
func (img Image) FilenameWhenDownloaded() string {
|
||||
return path.Base(img.Filename)
|
||||
}
|
||||
|
37
scraper/image_test.go
Normal file
37
scraper/image_test.go
Normal file
@ -0,0 +1,37 @@
|
||||
package scraper_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"io/ioutil"
|
||||
"encoding/json"
|
||||
|
||||
"offline_twitter/scraper"
|
||||
)
|
||||
|
||||
func TestParseAPIMedia(t *testing.T) {
|
||||
data, err := ioutil.ReadFile("test_responses/image.json")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
var apimedia scraper.APIMedia
|
||||
err = json.Unmarshal(data, &apimedia)
|
||||
if err != nil {
|
||||
t.Fatal(err.Error())
|
||||
}
|
||||
image := scraper.ParseAPIMedia(apimedia)
|
||||
|
||||
expected_remote_url := "https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg"
|
||||
if image.RemoteURL != expected_remote_url {
|
||||
t.Errorf("Expected %q, got %q", expected_remote_url, image.RemoteURL)
|
||||
}
|
||||
if image.Filename != expected_remote_url { // XXX filename: delete this check
|
||||
t.Errorf("Expected %q, got %q", expected_remote_url, image.Filename)
|
||||
}
|
||||
expected_local_filename := "E18sEUrWYAk8dBl.jpg"
|
||||
if image.LocalFilename != expected_local_filename {
|
||||
t.Errorf("Expected %q, got %q", expected_local_filename, image.LocalFilename)
|
||||
}
|
||||
if image.IsDownloaded {
|
||||
t.Errorf("Expected it not to be downloaded, but it was")
|
||||
}
|
||||
}
|
1
scraper/test_responses/image.json
Normal file
1
scraper/test_responses/image.json
Normal file
@ -0,0 +1 @@
|
||||
{"id_str":"1395882862289772553","indices":[27,50],"media_url":"http://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg","media_url_https":"https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg","url":"https://t.co/jSkwGsbKWv","display_url":"pic.twitter.com/jSkwGsbKWv","expanded_url":"https://twitter.com/michaelmalice/status/1395882872729477131/photo/1","type":"photo","original_info":{"width":593,"height":239,"focus_rects":[{"x":0,"y":0,"h":239,"w":427},{"x":14,"y":0,"h":239,"w":239},{"x":28,"y":0,"h":239,"w":210},{"x":73,"y":0,"h":239,"w":120},{"x":0,"y":0,"h":239,"w":593}]},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":593,"h":239,"resize":"fit"},"medium":{"w":593,"h":239,"resize":"fit"},"small":{"w":593,"h":239,"resize":"fit"}},"features":{"large":{"faces":[]},"medium":{"faces":[]},"small":{"faces":[]},"orig":{"faces":[]}}}
|
@ -92,11 +92,12 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
||||
ret.Urls = append(ret.Urls, url.ExpandedURL)
|
||||
}
|
||||
for _, media := range apiTweet.Entities.Media {
|
||||
if media.Type != "photo" {
|
||||
if media.Type != "photo" { // TODO: remove this eventually
|
||||
panic_str := fmt.Sprintf("Unknown media type: %q", media.Type)
|
||||
panic(panic_str)
|
||||
}
|
||||
new_image := Image{TweetID: ret.ID, Filename: media.MediaURLHttps, IsDownloaded: false}
|
||||
new_image := ParseAPIMedia(media)
|
||||
new_image.TweetID = ret.ID
|
||||
ret.Images = append(ret.Images, new_image)
|
||||
}
|
||||
for _, hashtag := range apiTweet.Entities.Hashtags {
|
||||
|
Loading…
x
Reference in New Issue
Block a user