Add handling for URL cards with no link preview thumbnail image
This commit is contained in:
parent
9d10fd5942
commit
0bb9ff6c6b
15
cmd/tests.sh
15
cmd/tests.sh
@ -130,14 +130,25 @@ test $urls_count_after_2x = $urls_count_after
|
||||
# Download the link's preview image
|
||||
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1428904664645394433") = "0"
|
||||
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1428904664645394433") = "0"
|
||||
test $(find link_preview_images/* | wc -l) = "0"
|
||||
test $(find link_preview_images | wc -l) = "1"
|
||||
tw download_tweet_content 1428904664645394433
|
||||
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1428904664645394433") = "1"
|
||||
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1428904664645394433") = "1"
|
||||
test $(find link_preview_images/* | wc -l) = "1"
|
||||
test $(find link_preview_images | wc -l) = "2"
|
||||
test -f link_preview_images/WX1Rv2AJ_800x320_1.jpg
|
||||
|
||||
|
||||
# Test a tweet with a URL but no thumbnail
|
||||
tw fetch_tweet https://twitter.com/Xirong7/status/1413665734866186243
|
||||
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1413665734866186243") = "0"
|
||||
test $(sqlite3 twitter.db "select has_thumbnail from urls where tweet_id = 1413665734866186243") = "0"
|
||||
test $(find link_preview_images | wc -l) = "2"
|
||||
tw download_tweet_content 1413665734866186243
|
||||
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1413665734866186243") = "1"
|
||||
test $(find link_preview_images | wc -l) = "2"
|
||||
|
||||
|
||||
|
||||
# TODO: Maybe this file should be broken up into multiple test scripts
|
||||
|
||||
echo -e "\033[32mAll tests passed. Finished successfully.\033[0m"
|
||||
|
@ -76,7 +76,7 @@ func (p Profile) download_tweet_video(v *scraper.Video, downloader MediaDownload
|
||||
* Downloads an URL thumbnail image, and if successful, marks it as downloaded in the DB
|
||||
*/
|
||||
func (p Profile) download_link_thumbnail(url *scraper.Url, downloader MediaDownloader) error {
|
||||
if url.HasCard {
|
||||
if url.HasCard && url.HasThumbnail {
|
||||
outfile := path.Join(p.ProfileDir, "link_preview_images", url.ThumbnailLocalPath)
|
||||
err := downloader.Curl(url.ThumbnailRemoteUrl, outfile)
|
||||
if err != nil {
|
||||
|
@ -47,12 +47,12 @@ func (p Profile) SaveVideo(vid scraper.Video) error {
|
||||
*/
|
||||
func (p Profile) SaveUrl(url scraper.Url) error {
|
||||
_, err := p.DB.Exec(`
|
||||
insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded)
|
||||
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, has_thumbnail, is_content_downloaded)
|
||||
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
on conflict do update
|
||||
set is_content_downloaded=?
|
||||
`,
|
||||
url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.IsContentDownloaded,
|
||||
url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.HasThumbnail, url.IsContentDownloaded,
|
||||
url.IsContentDownloaded,
|
||||
)
|
||||
return err
|
||||
@ -114,7 +114,7 @@ func (p Profile) GetVideosForTweet(t scraper.Tweet) (vids []scraper.Video, err e
|
||||
* Get the list of Urls for a Tweet
|
||||
*/
|
||||
func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error) {
|
||||
stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded from urls where tweet_id=? order by rowid")
|
||||
stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, has_thumbnail, is_content_downloaded from urls where tweet_id=? order by rowid")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
@ -125,7 +125,7 @@ func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error
|
||||
}
|
||||
var url scraper.Url
|
||||
for rows.Next() {
|
||||
err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.IsContentDownloaded)
|
||||
err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.HasThumbnail, &url.IsContentDownloaded)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
@ -63,6 +63,7 @@ create table urls (rowid integer primary key,
|
||||
thumbnail_remote_url text,
|
||||
thumbnail_local_path text,
|
||||
has_card boolean,
|
||||
has_thumbnail boolean,
|
||||
is_content_downloaded boolean default 0,
|
||||
|
||||
unique (tweet_id, text)
|
||||
|
1
scraper/test_responses/url_card_without_thumbnail.json
Normal file
1
scraper/test_responses/url_card_without_thumbnail.json
Normal file
@ -0,0 +1 @@
|
||||
{"name":"summary","url":"https://t.co/BfcswDBBtl","card_type_url":"http://card-type-url-is-deprecated.invalid","binding_values":{"vanity_url":{"type":"STRING","string_value":"en.m.wikipedia.org","scribe_key":"vanity_url"},"domain":{"type":"STRING","string_value":"en.m.wikipedia.org"},"title":{"type":"STRING","string_value":"Entryism - Wikipedia"},"card_url":{"type":"STRING","string_value":"https://t.co/BfcswDBBtl","scribe_key":"card_url"}},"card_platform":{"platform":{"device":{"name":"Swift","version":"12"},"audience":{"name":"production"}}}}
|
@ -19,38 +19,39 @@ type Url struct {
|
||||
SiteID UserID
|
||||
|
||||
HasCard bool
|
||||
HasThumbnail bool
|
||||
IsContentDownloaded bool
|
||||
}
|
||||
|
||||
func ParseAPIUrlCard(apiCard APICard) Url {
|
||||
values := apiCard.BindingValues
|
||||
ret := Url{}
|
||||
ret.HasCard = true
|
||||
|
||||
ret.Domain = values.Domain.Value
|
||||
ret.Title = values.Title.Value
|
||||
ret.Description = values.Description.Value
|
||||
ret.IsContentDownloaded = false
|
||||
ret.CreatorID = UserID(values.Creator.UserValue.Value)
|
||||
ret.SiteID = UserID(values.Site.UserValue.Value)
|
||||
|
||||
var thumbnail_url string
|
||||
|
||||
if apiCard.Name == "summary_large_image" || apiCard.Name == "summary" {
|
||||
return Url{
|
||||
Domain: values.Domain.Value,
|
||||
Title: values.Title.Value,
|
||||
Description: values.Description.Value,
|
||||
ThumbnailRemoteUrl: values.Thumbnail.ImageValue.Url,
|
||||
ThumbnailLocalPath: get_thumbnail_local_path(values.Thumbnail.ImageValue.Url),
|
||||
CreatorID: UserID(values.Creator.UserValue.Value),
|
||||
SiteID: UserID(values.Site.UserValue.Value),
|
||||
HasCard: true,
|
||||
IsContentDownloaded: false,
|
||||
}
|
||||
thumbnail_url = values.Thumbnail.ImageValue.Url
|
||||
} else if apiCard.Name == "player" {
|
||||
return Url{
|
||||
Domain: values.Domain.Value,
|
||||
Title: values.Title.Value,
|
||||
Description: values.Description.Value,
|
||||
ThumbnailRemoteUrl: values.PlayerImage.ImageValue.Url,
|
||||
ThumbnailLocalPath: get_thumbnail_local_path(values.PlayerImage.ImageValue.Url),
|
||||
CreatorID: UserID(values.Creator.UserValue.Value),
|
||||
SiteID: UserID(values.Site.UserValue.Value),
|
||||
HasCard: true,
|
||||
IsContentDownloaded: false,
|
||||
}
|
||||
thumbnail_url = values.PlayerImage.ImageValue.Url
|
||||
} else {
|
||||
panic("Unknown card type: " + apiCard.Name)
|
||||
}
|
||||
|
||||
if thumbnail_url != "" {
|
||||
ret.HasThumbnail = true
|
||||
ret.ThumbnailRemoteUrl = thumbnail_url
|
||||
ret.ThumbnailLocalPath = get_thumbnail_local_path(thumbnail_url)
|
||||
}
|
||||
|
||||
return ret
|
||||
}
|
||||
|
||||
func get_thumbnail_local_path(remote_url string) string {
|
||||
|
@ -48,6 +48,9 @@ func TestParseAPIUrlCard(t *testing.T) {
|
||||
if url.SiteID != expected_site_id {
|
||||
t.Errorf("Expected %d, got %d", expected_site_id, url.SiteID)
|
||||
}
|
||||
if !url.HasThumbnail {
|
||||
t.Errorf("Should have a thumbnail, but it doesn't")
|
||||
}
|
||||
if url.IsContentDownloaded {
|
||||
t.Errorf("Expected it not to be downloaded, but it was")
|
||||
}
|
||||
@ -89,7 +92,43 @@ func TestParseAPIUrlCardWithPlayer(t *testing.T) {
|
||||
if url.SiteID != expected_site_id {
|
||||
t.Errorf("Expected %d, got %d", expected_site_id, url.SiteID)
|
||||
}
|
||||
if !url.HasThumbnail {
|
||||
t.Errorf("Should have a thumbnail, but it doesn't")
|
||||
}
|
||||
if url.IsContentDownloaded {
|
||||
t.Errorf("Expected it not to be downloaded, but it was")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseAPIUrlCardWithoutThumbnail(t *testing.T) {
|
||||
data, err := ioutil.ReadFile("test_responses/url_card_without_thumbnail.json")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
var apiCard scraper.APICard
|
||||
err = json.Unmarshal(data, &apiCard)
|
||||
if err != nil {
|
||||
t.Fatal(err.Error())
|
||||
}
|
||||
url := scraper.ParseAPIUrlCard(apiCard)
|
||||
|
||||
expected_domain := "en.m.wikipedia.org"
|
||||
if url.Domain != expected_domain {
|
||||
t.Errorf("Expected %q, got %q", expected_domain, url.Domain)
|
||||
}
|
||||
expected_title := "Entryism - Wikipedia"
|
||||
if url.Title != expected_title {
|
||||
t.Errorf("Expected %q, got %q", expected_title, url.Title)
|
||||
}
|
||||
expected_description := ""
|
||||
if url.Description != expected_description {
|
||||
t.Errorf("Expected %q, got %q", expected_description, url.Description)
|
||||
}
|
||||
|
||||
if !url.HasCard {
|
||||
t.Errorf("Expected it to have a card, but it didn't")
|
||||
}
|
||||
if url.HasThumbnail {
|
||||
t.Errorf("Should have no thumbnail, but it does")
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user