Add handling for URL cards with no link preview thumbnail image
This commit is contained in:
parent
9d10fd5942
commit
0bb9ff6c6b
15
cmd/tests.sh
15
cmd/tests.sh
@ -130,14 +130,25 @@ test $urls_count_after_2x = $urls_count_after
|
|||||||
# Download the link's preview image
|
# Download the link's preview image
|
||||||
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1428904664645394433") = "0"
|
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1428904664645394433") = "0"
|
||||||
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1428904664645394433") = "0"
|
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1428904664645394433") = "0"
|
||||||
test $(find link_preview_images/* | wc -l) = "0"
|
test $(find link_preview_images | wc -l) = "1"
|
||||||
tw download_tweet_content 1428904664645394433
|
tw download_tweet_content 1428904664645394433
|
||||||
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1428904664645394433") = "1"
|
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1428904664645394433") = "1"
|
||||||
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1428904664645394433") = "1"
|
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1428904664645394433") = "1"
|
||||||
test $(find link_preview_images/* | wc -l) = "1"
|
test $(find link_preview_images | wc -l) = "2"
|
||||||
test -f link_preview_images/WX1Rv2AJ_800x320_1.jpg
|
test -f link_preview_images/WX1Rv2AJ_800x320_1.jpg
|
||||||
|
|
||||||
|
|
||||||
|
# Test a tweet with a URL but no thumbnail
|
||||||
|
tw fetch_tweet https://twitter.com/Xirong7/status/1413665734866186243
|
||||||
|
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1413665734866186243") = "0"
|
||||||
|
test $(sqlite3 twitter.db "select has_thumbnail from urls where tweet_id = 1413665734866186243") = "0"
|
||||||
|
test $(find link_preview_images | wc -l) = "2"
|
||||||
|
tw download_tweet_content 1413665734866186243
|
||||||
|
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1413665734866186243") = "1"
|
||||||
|
test $(find link_preview_images | wc -l) = "2"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# TODO: Maybe this file should be broken up into multiple test scripts
|
# TODO: Maybe this file should be broken up into multiple test scripts
|
||||||
|
|
||||||
echo -e "\033[32mAll tests passed. Finished successfully.\033[0m"
|
echo -e "\033[32mAll tests passed. Finished successfully.\033[0m"
|
||||||
|
@ -76,7 +76,7 @@ func (p Profile) download_tweet_video(v *scraper.Video, downloader MediaDownload
|
|||||||
* Downloads an URL thumbnail image, and if successful, marks it as downloaded in the DB
|
* Downloads an URL thumbnail image, and if successful, marks it as downloaded in the DB
|
||||||
*/
|
*/
|
||||||
func (p Profile) download_link_thumbnail(url *scraper.Url, downloader MediaDownloader) error {
|
func (p Profile) download_link_thumbnail(url *scraper.Url, downloader MediaDownloader) error {
|
||||||
if url.HasCard {
|
if url.HasCard && url.HasThumbnail {
|
||||||
outfile := path.Join(p.ProfileDir, "link_preview_images", url.ThumbnailLocalPath)
|
outfile := path.Join(p.ProfileDir, "link_preview_images", url.ThumbnailLocalPath)
|
||||||
err := downloader.Curl(url.ThumbnailRemoteUrl, outfile)
|
err := downloader.Curl(url.ThumbnailRemoteUrl, outfile)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -47,12 +47,12 @@ func (p Profile) SaveVideo(vid scraper.Video) error {
|
|||||||
*/
|
*/
|
||||||
func (p Profile) SaveUrl(url scraper.Url) error {
|
func (p Profile) SaveUrl(url scraper.Url) error {
|
||||||
_, err := p.DB.Exec(`
|
_, err := p.DB.Exec(`
|
||||||
insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded)
|
insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, has_thumbnail, is_content_downloaded)
|
||||||
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
on conflict do update
|
on conflict do update
|
||||||
set is_content_downloaded=?
|
set is_content_downloaded=?
|
||||||
`,
|
`,
|
||||||
url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.IsContentDownloaded,
|
url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.HasThumbnail, url.IsContentDownloaded,
|
||||||
url.IsContentDownloaded,
|
url.IsContentDownloaded,
|
||||||
)
|
)
|
||||||
return err
|
return err
|
||||||
@ -114,7 +114,7 @@ func (p Profile) GetVideosForTweet(t scraper.Tweet) (vids []scraper.Video, err e
|
|||||||
* Get the list of Urls for a Tweet
|
* Get the list of Urls for a Tweet
|
||||||
*/
|
*/
|
||||||
func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error) {
|
func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error) {
|
||||||
stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded from urls where tweet_id=? order by rowid")
|
stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, has_thumbnail, is_content_downloaded from urls where tweet_id=? order by rowid")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@ -125,7 +125,7 @@ func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error
|
|||||||
}
|
}
|
||||||
var url scraper.Url
|
var url scraper.Url
|
||||||
for rows.Next() {
|
for rows.Next() {
|
||||||
err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.IsContentDownloaded)
|
err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.HasThumbnail, &url.IsContentDownloaded)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
@ -63,6 +63,7 @@ create table urls (rowid integer primary key,
|
|||||||
thumbnail_remote_url text,
|
thumbnail_remote_url text,
|
||||||
thumbnail_local_path text,
|
thumbnail_local_path text,
|
||||||
has_card boolean,
|
has_card boolean,
|
||||||
|
has_thumbnail boolean,
|
||||||
is_content_downloaded boolean default 0,
|
is_content_downloaded boolean default 0,
|
||||||
|
|
||||||
unique (tweet_id, text)
|
unique (tweet_id, text)
|
||||||
|
1
scraper/test_responses/url_card_without_thumbnail.json
Normal file
1
scraper/test_responses/url_card_without_thumbnail.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{"name":"summary","url":"https://t.co/BfcswDBBtl","card_type_url":"http://card-type-url-is-deprecated.invalid","binding_values":{"vanity_url":{"type":"STRING","string_value":"en.m.wikipedia.org","scribe_key":"vanity_url"},"domain":{"type":"STRING","string_value":"en.m.wikipedia.org"},"title":{"type":"STRING","string_value":"Entryism - Wikipedia"},"card_url":{"type":"STRING","string_value":"https://t.co/BfcswDBBtl","scribe_key":"card_url"}},"card_platform":{"platform":{"device":{"name":"Swift","version":"12"},"audience":{"name":"production"}}}}
|
@ -19,38 +19,39 @@ type Url struct {
|
|||||||
SiteID UserID
|
SiteID UserID
|
||||||
|
|
||||||
HasCard bool
|
HasCard bool
|
||||||
|
HasThumbnail bool
|
||||||
IsContentDownloaded bool
|
IsContentDownloaded bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func ParseAPIUrlCard(apiCard APICard) Url {
|
func ParseAPIUrlCard(apiCard APICard) Url {
|
||||||
values := apiCard.BindingValues
|
values := apiCard.BindingValues
|
||||||
|
ret := Url{}
|
||||||
|
ret.HasCard = true
|
||||||
|
|
||||||
|
ret.Domain = values.Domain.Value
|
||||||
|
ret.Title = values.Title.Value
|
||||||
|
ret.Description = values.Description.Value
|
||||||
|
ret.IsContentDownloaded = false
|
||||||
|
ret.CreatorID = UserID(values.Creator.UserValue.Value)
|
||||||
|
ret.SiteID = UserID(values.Site.UserValue.Value)
|
||||||
|
|
||||||
|
var thumbnail_url string
|
||||||
|
|
||||||
if apiCard.Name == "summary_large_image" || apiCard.Name == "summary" {
|
if apiCard.Name == "summary_large_image" || apiCard.Name == "summary" {
|
||||||
return Url{
|
thumbnail_url = values.Thumbnail.ImageValue.Url
|
||||||
Domain: values.Domain.Value,
|
|
||||||
Title: values.Title.Value,
|
|
||||||
Description: values.Description.Value,
|
|
||||||
ThumbnailRemoteUrl: values.Thumbnail.ImageValue.Url,
|
|
||||||
ThumbnailLocalPath: get_thumbnail_local_path(values.Thumbnail.ImageValue.Url),
|
|
||||||
CreatorID: UserID(values.Creator.UserValue.Value),
|
|
||||||
SiteID: UserID(values.Site.UserValue.Value),
|
|
||||||
HasCard: true,
|
|
||||||
IsContentDownloaded: false,
|
|
||||||
}
|
|
||||||
} else if apiCard.Name == "player" {
|
} else if apiCard.Name == "player" {
|
||||||
return Url{
|
thumbnail_url = values.PlayerImage.ImageValue.Url
|
||||||
Domain: values.Domain.Value,
|
|
||||||
Title: values.Title.Value,
|
|
||||||
Description: values.Description.Value,
|
|
||||||
ThumbnailRemoteUrl: values.PlayerImage.ImageValue.Url,
|
|
||||||
ThumbnailLocalPath: get_thumbnail_local_path(values.PlayerImage.ImageValue.Url),
|
|
||||||
CreatorID: UserID(values.Creator.UserValue.Value),
|
|
||||||
SiteID: UserID(values.Site.UserValue.Value),
|
|
||||||
HasCard: true,
|
|
||||||
IsContentDownloaded: false,
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
panic("Unknown card type: " + apiCard.Name)
|
panic("Unknown card type: " + apiCard.Name)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if thumbnail_url != "" {
|
||||||
|
ret.HasThumbnail = true
|
||||||
|
ret.ThumbnailRemoteUrl = thumbnail_url
|
||||||
|
ret.ThumbnailLocalPath = get_thumbnail_local_path(thumbnail_url)
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret
|
||||||
}
|
}
|
||||||
|
|
||||||
func get_thumbnail_local_path(remote_url string) string {
|
func get_thumbnail_local_path(remote_url string) string {
|
||||||
|
@ -48,6 +48,9 @@ func TestParseAPIUrlCard(t *testing.T) {
|
|||||||
if url.SiteID != expected_site_id {
|
if url.SiteID != expected_site_id {
|
||||||
t.Errorf("Expected %d, got %d", expected_site_id, url.SiteID)
|
t.Errorf("Expected %d, got %d", expected_site_id, url.SiteID)
|
||||||
}
|
}
|
||||||
|
if !url.HasThumbnail {
|
||||||
|
t.Errorf("Should have a thumbnail, but it doesn't")
|
||||||
|
}
|
||||||
if url.IsContentDownloaded {
|
if url.IsContentDownloaded {
|
||||||
t.Errorf("Expected it not to be downloaded, but it was")
|
t.Errorf("Expected it not to be downloaded, but it was")
|
||||||
}
|
}
|
||||||
@ -89,7 +92,43 @@ func TestParseAPIUrlCardWithPlayer(t *testing.T) {
|
|||||||
if url.SiteID != expected_site_id {
|
if url.SiteID != expected_site_id {
|
||||||
t.Errorf("Expected %d, got %d", expected_site_id, url.SiteID)
|
t.Errorf("Expected %d, got %d", expected_site_id, url.SiteID)
|
||||||
}
|
}
|
||||||
|
if !url.HasThumbnail {
|
||||||
|
t.Errorf("Should have a thumbnail, but it doesn't")
|
||||||
|
}
|
||||||
if url.IsContentDownloaded {
|
if url.IsContentDownloaded {
|
||||||
t.Errorf("Expected it not to be downloaded, but it was")
|
t.Errorf("Expected it not to be downloaded, but it was")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseAPIUrlCardWithoutThumbnail(t *testing.T) {
|
||||||
|
data, err := ioutil.ReadFile("test_responses/url_card_without_thumbnail.json")
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
var apiCard scraper.APICard
|
||||||
|
err = json.Unmarshal(data, &apiCard)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err.Error())
|
||||||
|
}
|
||||||
|
url := scraper.ParseAPIUrlCard(apiCard)
|
||||||
|
|
||||||
|
expected_domain := "en.m.wikipedia.org"
|
||||||
|
if url.Domain != expected_domain {
|
||||||
|
t.Errorf("Expected %q, got %q", expected_domain, url.Domain)
|
||||||
|
}
|
||||||
|
expected_title := "Entryism - Wikipedia"
|
||||||
|
if url.Title != expected_title {
|
||||||
|
t.Errorf("Expected %q, got %q", expected_title, url.Title)
|
||||||
|
}
|
||||||
|
expected_description := ""
|
||||||
|
if url.Description != expected_description {
|
||||||
|
t.Errorf("Expected %q, got %q", expected_description, url.Description)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !url.HasCard {
|
||||||
|
t.Errorf("Expected it to have a card, but it didn't")
|
||||||
|
}
|
||||||
|
if url.HasThumbnail {
|
||||||
|
t.Errorf("Should have no thumbnail, but it does")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user