Add handling for URL cards with no link preview thumbnail image

This commit is contained in:
Alessio 2021-09-17 20:50:28 -07:00
parent 9d10fd5942
commit 0bb9ff6c6b
7 changed files with 83 additions and 30 deletions

View File

@ -130,14 +130,25 @@ test $urls_count_after_2x = $urls_count_after
# Download the link's preview image # Download the link's preview image
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1428904664645394433") = "0" test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1428904664645394433") = "0"
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1428904664645394433") = "0" test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1428904664645394433") = "0"
test $(find link_preview_images/* | wc -l) = "0" test $(find link_preview_images | wc -l) = "1"
tw download_tweet_content 1428904664645394433 tw download_tweet_content 1428904664645394433
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1428904664645394433") = "1" test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1428904664645394433") = "1"
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1428904664645394433") = "1" test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1428904664645394433") = "1"
test $(find link_preview_images/* | wc -l) = "1" test $(find link_preview_images | wc -l) = "2"
test -f link_preview_images/WX1Rv2AJ_800x320_1.jpg test -f link_preview_images/WX1Rv2AJ_800x320_1.jpg
# Test a tweet with a URL but no thumbnail
tw fetch_tweet https://twitter.com/Xirong7/status/1413665734866186243
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1413665734866186243") = "0"
test $(sqlite3 twitter.db "select has_thumbnail from urls where tweet_id = 1413665734866186243") = "0"
test $(find link_preview_images | wc -l) = "2"
tw download_tweet_content 1413665734866186243
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1413665734866186243") = "1"
test $(find link_preview_images | wc -l) = "2"
# TODO: Maybe this file should be broken up into multiple test scripts # TODO: Maybe this file should be broken up into multiple test scripts
echo -e "\033[32mAll tests passed. Finished successfully.\033[0m" echo -e "\033[32mAll tests passed. Finished successfully.\033[0m"

View File

@ -76,7 +76,7 @@ func (p Profile) download_tweet_video(v *scraper.Video, downloader MediaDownload
* Downloads an URL thumbnail image, and if successful, marks it as downloaded in the DB * Downloads an URL thumbnail image, and if successful, marks it as downloaded in the DB
*/ */
func (p Profile) download_link_thumbnail(url *scraper.Url, downloader MediaDownloader) error { func (p Profile) download_link_thumbnail(url *scraper.Url, downloader MediaDownloader) error {
if url.HasCard { if url.HasCard && url.HasThumbnail {
outfile := path.Join(p.ProfileDir, "link_preview_images", url.ThumbnailLocalPath) outfile := path.Join(p.ProfileDir, "link_preview_images", url.ThumbnailLocalPath)
err := downloader.Curl(url.ThumbnailRemoteUrl, outfile) err := downloader.Curl(url.ThumbnailRemoteUrl, outfile)
if err != nil { if err != nil {

View File

@ -47,12 +47,12 @@ func (p Profile) SaveVideo(vid scraper.Video) error {
*/ */
func (p Profile) SaveUrl(url scraper.Url) error { func (p Profile) SaveUrl(url scraper.Url) error {
_, err := p.DB.Exec(` _, err := p.DB.Exec(`
insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded) insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, has_thumbnail, is_content_downloaded)
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
on conflict do update on conflict do update
set is_content_downloaded=? set is_content_downloaded=?
`, `,
url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.IsContentDownloaded, url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.HasThumbnail, url.IsContentDownloaded,
url.IsContentDownloaded, url.IsContentDownloaded,
) )
return err return err
@ -114,7 +114,7 @@ func (p Profile) GetVideosForTweet(t scraper.Tweet) (vids []scraper.Video, err e
* Get the list of Urls for a Tweet * Get the list of Urls for a Tweet
*/ */
func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error) { func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error) {
stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded from urls where tweet_id=? order by rowid") stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, has_thumbnail, is_content_downloaded from urls where tweet_id=? order by rowid")
if err != nil { if err != nil {
return return
} }
@ -125,7 +125,7 @@ func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error
} }
var url scraper.Url var url scraper.Url
for rows.Next() { for rows.Next() {
err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.IsContentDownloaded) err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.HasThumbnail, &url.IsContentDownloaded)
if err != nil { if err != nil {
return return
} }

View File

@ -63,6 +63,7 @@ create table urls (rowid integer primary key,
thumbnail_remote_url text, thumbnail_remote_url text,
thumbnail_local_path text, thumbnail_local_path text,
has_card boolean, has_card boolean,
has_thumbnail boolean,
is_content_downloaded boolean default 0, is_content_downloaded boolean default 0,
unique (tweet_id, text) unique (tweet_id, text)

View File

@ -0,0 +1 @@
{"name":"summary","url":"https://t.co/BfcswDBBtl","card_type_url":"http://card-type-url-is-deprecated.invalid","binding_values":{"vanity_url":{"type":"STRING","string_value":"en.m.wikipedia.org","scribe_key":"vanity_url"},"domain":{"type":"STRING","string_value":"en.m.wikipedia.org"},"title":{"type":"STRING","string_value":"Entryism - Wikipedia"},"card_url":{"type":"STRING","string_value":"https://t.co/BfcswDBBtl","scribe_key":"card_url"}},"card_platform":{"platform":{"device":{"name":"Swift","version":"12"},"audience":{"name":"production"}}}}

View File

@ -19,38 +19,39 @@ type Url struct {
SiteID UserID SiteID UserID
HasCard bool HasCard bool
HasThumbnail bool
IsContentDownloaded bool IsContentDownloaded bool
} }
func ParseAPIUrlCard(apiCard APICard) Url { func ParseAPIUrlCard(apiCard APICard) Url {
values := apiCard.BindingValues values := apiCard.BindingValues
ret := Url{}
ret.HasCard = true
ret.Domain = values.Domain.Value
ret.Title = values.Title.Value
ret.Description = values.Description.Value
ret.IsContentDownloaded = false
ret.CreatorID = UserID(values.Creator.UserValue.Value)
ret.SiteID = UserID(values.Site.UserValue.Value)
var thumbnail_url string
if apiCard.Name == "summary_large_image" || apiCard.Name == "summary" { if apiCard.Name == "summary_large_image" || apiCard.Name == "summary" {
return Url{ thumbnail_url = values.Thumbnail.ImageValue.Url
Domain: values.Domain.Value,
Title: values.Title.Value,
Description: values.Description.Value,
ThumbnailRemoteUrl: values.Thumbnail.ImageValue.Url,
ThumbnailLocalPath: get_thumbnail_local_path(values.Thumbnail.ImageValue.Url),
CreatorID: UserID(values.Creator.UserValue.Value),
SiteID: UserID(values.Site.UserValue.Value),
HasCard: true,
IsContentDownloaded: false,
}
} else if apiCard.Name == "player" { } else if apiCard.Name == "player" {
return Url{ thumbnail_url = values.PlayerImage.ImageValue.Url
Domain: values.Domain.Value,
Title: values.Title.Value,
Description: values.Description.Value,
ThumbnailRemoteUrl: values.PlayerImage.ImageValue.Url,
ThumbnailLocalPath: get_thumbnail_local_path(values.PlayerImage.ImageValue.Url),
CreatorID: UserID(values.Creator.UserValue.Value),
SiteID: UserID(values.Site.UserValue.Value),
HasCard: true,
IsContentDownloaded: false,
}
} else { } else {
panic("Unknown card type: " + apiCard.Name) panic("Unknown card type: " + apiCard.Name)
} }
if thumbnail_url != "" {
ret.HasThumbnail = true
ret.ThumbnailRemoteUrl = thumbnail_url
ret.ThumbnailLocalPath = get_thumbnail_local_path(thumbnail_url)
}
return ret
} }
func get_thumbnail_local_path(remote_url string) string { func get_thumbnail_local_path(remote_url string) string {

View File

@ -48,6 +48,9 @@ func TestParseAPIUrlCard(t *testing.T) {
if url.SiteID != expected_site_id { if url.SiteID != expected_site_id {
t.Errorf("Expected %d, got %d", expected_site_id, url.SiteID) t.Errorf("Expected %d, got %d", expected_site_id, url.SiteID)
} }
if !url.HasThumbnail {
t.Errorf("Should have a thumbnail, but it doesn't")
}
if url.IsContentDownloaded { if url.IsContentDownloaded {
t.Errorf("Expected it not to be downloaded, but it was") t.Errorf("Expected it not to be downloaded, but it was")
} }
@ -89,7 +92,43 @@ func TestParseAPIUrlCardWithPlayer(t *testing.T) {
if url.SiteID != expected_site_id { if url.SiteID != expected_site_id {
t.Errorf("Expected %d, got %d", expected_site_id, url.SiteID) t.Errorf("Expected %d, got %d", expected_site_id, url.SiteID)
} }
if !url.HasThumbnail {
t.Errorf("Should have a thumbnail, but it doesn't")
}
if url.IsContentDownloaded { if url.IsContentDownloaded {
t.Errorf("Expected it not to be downloaded, but it was") t.Errorf("Expected it not to be downloaded, but it was")
} }
} }
// TestParseAPIUrlCardWithoutThumbnail parses a card fixture that carries no
// thumbnail image and checks that the resulting Url still reports a card, but
// has HasThumbnail unset and no thumbnail location filled in.
func TestParseAPIUrlCardWithoutThumbnail(t *testing.T) {
	data, err := ioutil.ReadFile("test_responses/url_card_without_thumbnail.json")
	if err != nil {
		// Fail this test only; a panic here would abort the whole test binary.
		t.Fatalf("reading fixture: %v", err)
	}
	var apiCard scraper.APICard
	if err = json.Unmarshal(data, &apiCard); err != nil {
		t.Fatal(err.Error())
	}
	url := scraper.ParseAPIUrlCard(apiCard)

	expected_domain := "en.m.wikipedia.org"
	if url.Domain != expected_domain {
		t.Errorf("Expected %q, got %q", expected_domain, url.Domain)
	}
	expected_title := "Entryism - Wikipedia"
	if url.Title != expected_title {
		t.Errorf("Expected %q, got %q", expected_title, url.Title)
	}
	expected_description := ""
	if url.Description != expected_description {
		t.Errorf("Expected %q, got %q", expected_description, url.Description)
	}
	if !url.HasCard {
		t.Errorf("Expected it to have a card, but it didn't")
	}
	if url.HasThumbnail {
		t.Errorf("Should have no thumbnail, but it does")
	}
	// With no thumbnail, neither the remote URL nor the local path may be set;
	// otherwise a later download step could try to fetch a bogus location.
	if url.ThumbnailRemoteUrl != "" {
		t.Errorf("Expected no thumbnail remote URL, got %q", url.ThumbnailRemoteUrl)
	}
	if url.ThumbnailLocalPath != "" {
		t.Errorf("Expected no thumbnail local path, got %q", url.ThumbnailLocalPath)
	}
	if url.IsContentDownloaded {
		t.Errorf("Expected it not to be downloaded, but it was")
	}
}