Change method of scraping images
- Break the `Filename` field on `Image` into `RemoteURL` and `LocalFilename`
- Add a new parsing function for an `APIMedia` item
This commit is contained in:
parent
2663387f29
commit
494ca25dc4
@ -3,6 +3,8 @@
|
|||||||
set -e
|
set -e
|
||||||
set -x
|
set -x
|
||||||
|
|
||||||
|
PS4='+(${BASH_SOURCE}:${LINENO}): '
|
||||||
|
|
||||||
test -e data && rm -r data
|
test -e data && rm -r data
|
||||||
|
|
||||||
go run ./twitter create_profile data
|
go run ./twitter create_profile data
|
||||||
|
@ -14,6 +14,12 @@ func (v SortableVariants) Len() int { return len(v) }
|
|||||||
func (v SortableVariants) Swap(i, j int) { v[i], v[j] = v[j], v[i] }
|
func (v SortableVariants) Swap(i, j int) { v[i], v[j] = v[j], v[i] }
|
||||||
func (v SortableVariants) Less(i, j int) bool { return v[i].Bitrate > v[j].Bitrate }
|
func (v SortableVariants) Less(i, j int) bool { return v[i].Bitrate > v[j].Bitrate }
|
||||||
|
|
||||||
|
// APIMedia is a single media item as decoded from the JSON of an API
// response. Only the keys named in the struct tags are decoded; any other
// keys present in the JSON object are ignored.
type APIMedia struct {
	// MediaURLHttps is decoded from the "media_url_https" key.
	// ParseAPIMedia reads it to fill in the resulting Image.
	MediaURLHttps string `json:"media_url_https"`

	// Type is decoded from the "type" key, e.g. "photo".
	Type string `json:"type"`

	// URL is decoded from the "url" key.
	// NOTE(review): appears to be the shortened link to the media — confirm.
	URL string `json:"url"`
}
|
||||||
|
|
||||||
type APITweet struct {
|
type APITweet struct {
|
||||||
ID string `json:"id_str"`
|
ID string `json:"id_str"`
|
||||||
ConversationIDStr string `json:"conversation_id_str"`
|
ConversationIDStr string `json:"conversation_id_str"`
|
||||||
@ -24,11 +30,7 @@ type APITweet struct {
|
|||||||
Hashtags []struct {
|
Hashtags []struct {
|
||||||
Text string `json:"text"`
|
Text string `json:"text"`
|
||||||
} `json:"hashtags"`
|
} `json:"hashtags"`
|
||||||
Media []struct {
|
Media []APIMedia `json:"media"`
|
||||||
MediaURLHttps string `json:"media_url_https"`
|
|
||||||
Type string `json:"type"`
|
|
||||||
URL string `json:"url"`
|
|
||||||
} `json:"media"`
|
|
||||||
URLs []struct {
|
URLs []struct {
|
||||||
ExpandedURL string `json:"expanded_url"`
|
ExpandedURL string `json:"expanded_url"`
|
||||||
URL string `json:"url"`
|
URL string `json:"url"`
|
||||||
|
@ -10,9 +10,21 @@ type Image struct {
|
|||||||
ID ImageID
|
ID ImageID
|
||||||
TweetID TweetID
|
TweetID TweetID
|
||||||
Filename string
|
Filename string
|
||||||
|
RemoteURL string
|
||||||
|
LocalFilename string
|
||||||
IsDownloaded bool
|
IsDownloaded bool
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ParseAPIMedia(apiMedia APIMedia) Image {
|
||||||
|
local_filename := path.Base(apiMedia.MediaURLHttps)
|
||||||
|
return Image{
|
||||||
|
Filename: apiMedia.MediaURLHttps, // XXX filename
|
||||||
|
RemoteURL: apiMedia.MediaURLHttps,
|
||||||
|
LocalFilename: local_filename,
|
||||||
|
IsDownloaded: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (img Image) FilenameWhenDownloaded() string {
|
func (img Image) FilenameWhenDownloaded() string {
|
||||||
return path.Base(img.Filename)
|
return path.Base(img.Filename)
|
||||||
}
|
}
|
||||||
|
37
scraper/image_test.go
Normal file
37
scraper/image_test.go
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
package scraper_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"io/ioutil"
|
||||||
|
"encoding/json"
|
||||||
|
|
||||||
|
"offline_twitter/scraper"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParseAPIMedia(t *testing.T) {
|
||||||
|
data, err := ioutil.ReadFile("test_responses/image.json")
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
var apimedia scraper.APIMedia
|
||||||
|
err = json.Unmarshal(data, &apimedia)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err.Error())
|
||||||
|
}
|
||||||
|
image := scraper.ParseAPIMedia(apimedia)
|
||||||
|
|
||||||
|
expected_remote_url := "https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg"
|
||||||
|
if image.RemoteURL != expected_remote_url {
|
||||||
|
t.Errorf("Expected %q, got %q", expected_remote_url, image.RemoteURL)
|
||||||
|
}
|
||||||
|
if image.Filename != expected_remote_url { // XXX filename: delete this check
|
||||||
|
t.Errorf("Expected %q, got %q", expected_remote_url, image.Filename)
|
||||||
|
}
|
||||||
|
expected_local_filename := "E18sEUrWYAk8dBl.jpg"
|
||||||
|
if image.LocalFilename != expected_local_filename {
|
||||||
|
t.Errorf("Expected %q, got %q", expected_local_filename, image.LocalFilename)
|
||||||
|
}
|
||||||
|
if image.IsDownloaded {
|
||||||
|
t.Errorf("Expected it not to be downloaded, but it was")
|
||||||
|
}
|
||||||
|
}
|
1
scraper/test_responses/image.json
Normal file
1
scraper/test_responses/image.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{"id_str":"1395882862289772553","indices":[27,50],"media_url":"http://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg","media_url_https":"https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg","url":"https://t.co/jSkwGsbKWv","display_url":"pic.twitter.com/jSkwGsbKWv","expanded_url":"https://twitter.com/michaelmalice/status/1395882872729477131/photo/1","type":"photo","original_info":{"width":593,"height":239,"focus_rects":[{"x":0,"y":0,"h":239,"w":427},{"x":14,"y":0,"h":239,"w":239},{"x":28,"y":0,"h":239,"w":210},{"x":73,"y":0,"h":239,"w":120},{"x":0,"y":0,"h":239,"w":593}]},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":593,"h":239,"resize":"fit"},"medium":{"w":593,"h":239,"resize":"fit"},"small":{"w":593,"h":239,"resize":"fit"}},"features":{"large":{"faces":[]},"medium":{"faces":[]},"small":{"faces":[]},"orig":{"faces":[]}}}
|
@ -92,11 +92,12 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
|||||||
ret.Urls = append(ret.Urls, url.ExpandedURL)
|
ret.Urls = append(ret.Urls, url.ExpandedURL)
|
||||||
}
|
}
|
||||||
for _, media := range apiTweet.Entities.Media {
|
for _, media := range apiTweet.Entities.Media {
|
||||||
if media.Type != "photo" {
|
if media.Type != "photo" { // TODO: remove this eventually
|
||||||
panic_str := fmt.Sprintf("Unknown media type: %q", media.Type)
|
panic_str := fmt.Sprintf("Unknown media type: %q", media.Type)
|
||||||
panic(panic_str)
|
panic(panic_str)
|
||||||
}
|
}
|
||||||
new_image := Image{TweetID: ret.ID, Filename: media.MediaURLHttps, IsDownloaded: false}
|
new_image := ParseAPIMedia(media)
|
||||||
|
new_image.TweetID = ret.ID
|
||||||
ret.Images = append(ret.Images, new_image)
|
ret.Images = append(ret.Images, new_image)
|
||||||
}
|
}
|
||||||
for _, hashtag := range apiTweet.Entities.Hashtags {
|
for _, hashtag := range apiTweet.Entities.Hashtags {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user