Add handling for URL cards with no link preview thumbnail image

This commit is contained in:
Alessio 2021-09-17 20:50:28 -07:00
parent 9d10fd5942
commit 0bb9ff6c6b
7 changed files with 83 additions and 30 deletions

View File

@ -130,14 +130,25 @@ test $urls_count_after_2x = $urls_count_after
# Download the link's preview image
# Before the download: neither the tweet row nor its url row is flagged as downloaded,
# and there are no entries underneath link_preview_images/.
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1428904664645394433") = "0"
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1428904664645394433") = "0"
test $(find link_preview_images/* | wc -l) = "0"
# `find` on the bare directory also lists the directory itself, so a count of 1 means it is empty
test $(find link_preview_images | wc -l) = "1"
tw download_tweet_content 1428904664645394433
# After the download: both rows are flagged, and exactly one new file has appeared
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1428904664645394433") = "1"
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1428904664645394433") = "1"
test $(find link_preview_images/* | wc -l) = "1"
# Directory entry + 1 downloaded thumbnail = 2
test $(find link_preview_images | wc -l) = "2"
test -f link_preview_images/WX1Rv2AJ_800x320_1.jpg
# Test a tweet with a URL but no thumbnail
tw fetch_tweet https://twitter.com/Xirong7/status/1413665734866186243
# The url row is stored with has_thumbnail = 0 and starts out not downloaded
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1413665734866186243") = "0"
test $(sqlite3 twitter.db "select has_thumbnail from urls where tweet_id = 1413665734866186243") = "0"
# Count is unchanged from the previous section (still just the directory + the earlier thumbnail)
test $(find link_preview_images | wc -l) = "2"
tw download_tweet_content 1413665734866186243
# Downloading must still mark the url as done, but must not add any file to link_preview_images
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1413665734866186243") = "1"
test $(find link_preview_images | wc -l) = "2"
# TODO: Maybe this file should be broken up into multiple test scripts
# Green success banner (ANSI escape 32 = green, 0 = reset)
echo -e "\033[32mAll tests passed. Finished successfully.\033[0m"

View File

@ -76,7 +76,7 @@ func (p Profile) download_tweet_video(v *scraper.Video, downloader MediaDownload
* Downloads an URL thumbnail image, and if successful, marks it as downloaded in the DB
*/
func (p Profile) download_link_thumbnail(url *scraper.Url, downloader MediaDownloader) error {
if url.HasCard {
if url.HasCard && url.HasThumbnail {
outfile := path.Join(p.ProfileDir, "link_preview_images", url.ThumbnailLocalPath)
err := downloader.Curl(url.ThumbnailRemoteUrl, outfile)
if err != nil {

View File

@ -47,12 +47,12 @@ func (p Profile) SaveVideo(vid scraper.Video) error {
*/
func (p Profile) SaveUrl(url scraper.Url) error {
_, err := p.DB.Exec(`
insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded)
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, has_thumbnail, is_content_downloaded)
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
on conflict do update
set is_content_downloaded=?
`,
url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.IsContentDownloaded,
url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.HasThumbnail, url.IsContentDownloaded,
url.IsContentDownloaded,
)
return err
@ -114,7 +114,7 @@ func (p Profile) GetVideosForTweet(t scraper.Tweet) (vids []scraper.Video, err e
* Get the list of Urls for a Tweet
*/
func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error) {
stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded from urls where tweet_id=? order by rowid")
stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, has_thumbnail, is_content_downloaded from urls where tweet_id=? order by rowid")
if err != nil {
return
}
@ -125,7 +125,7 @@ func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error
}
var url scraper.Url
for rows.Next() {
err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.IsContentDownloaded)
err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.HasThumbnail, &url.IsContentDownloaded)
if err != nil {
return
}

View File

@ -63,6 +63,7 @@ create table urls (rowid integer primary key,
thumbnail_remote_url text,
thumbnail_local_path text,
has_card boolean,
has_thumbnail boolean,
is_content_downloaded boolean default 0,
unique (tweet_id, text)

View File

@ -0,0 +1 @@
{"name":"summary","url":"https://t.co/BfcswDBBtl","card_type_url":"http://card-type-url-is-deprecated.invalid","binding_values":{"vanity_url":{"type":"STRING","string_value":"en.m.wikipedia.org","scribe_key":"vanity_url"},"domain":{"type":"STRING","string_value":"en.m.wikipedia.org"},"title":{"type":"STRING","string_value":"Entryism - Wikipedia"},"card_url":{"type":"STRING","string_value":"https://t.co/BfcswDBBtl","scribe_key":"card_url"}},"card_platform":{"platform":{"device":{"name":"Swift","version":"12"},"audience":{"name":"production"}}}}

View File

@ -19,38 +19,39 @@ type Url struct {
SiteID UserID
HasCard bool
HasThumbnail bool
IsContentDownloaded bool
}
/**
 * Convert an API card into a Url.
 *
 * The text fields (domain, title, description) and the creator / site IDs are copied straight
 * from the card's binding values.  The preview image lives in a different binding depending on
 * the card type: "summary" and "summary_large_image" cards use `Thumbnail`, "player" cards use
 * `PlayerImage`.
 *
 * A card may have no preview image at all.  In that case `HasThumbnail` stays false and both
 * thumbnail fields are left empty, so callers can skip the thumbnail download.
 *
 * args:
 * - apiCard: the card as decoded from the API response
 *
 * returns: a Url with `HasCard` set and `IsContentDownloaded` unset
 *
 * Panics if the card type is not one of the types listed above.
 */
func ParseAPIUrlCard(apiCard APICard) Url {
	values := apiCard.BindingValues

	// Pick the binding that holds this card type's preview image.  Unknown card types are
	// rejected before anything else is built.
	var thumbnail_url string
	switch apiCard.Name {
	case "summary_large_image", "summary":
		thumbnail_url = values.Thumbnail.ImageValue.Url
	case "player":
		thumbnail_url = values.PlayerImage.ImageValue.Url
	default:
		panic("Unknown card type: " + apiCard.Name)
	}

	ret := Url{
		Domain:              values.Domain.Value,
		Title:               values.Title.Value,
		Description:         values.Description.Value,
		CreatorID:           UserID(values.Creator.UserValue.Value),
		SiteID:              UserID(values.Site.UserValue.Value),
		HasCard:             true,
		IsContentDownloaded: false,
	}

	// Only derive a local path when there is actually a remote image to fetch; an empty URL
	// means the card has no thumbnail.
	if thumbnail_url != "" {
		ret.HasThumbnail = true
		ret.ThumbnailRemoteUrl = thumbnail_url
		ret.ThumbnailLocalPath = get_thumbnail_local_path(thumbnail_url)
	}
	return ret
}
func get_thumbnail_local_path(remote_url string) string {

View File

@ -48,6 +48,9 @@ func TestParseAPIUrlCard(t *testing.T) {
if url.SiteID != expected_site_id {
t.Errorf("Expected %d, got %d", expected_site_id, url.SiteID)
}
if !url.HasThumbnail {
t.Errorf("Should have a thumbnail, but it doesn't")
}
if url.IsContentDownloaded {
t.Errorf("Expected it not to be downloaded, but it was")
}
@ -89,7 +92,43 @@ func TestParseAPIUrlCardWithPlayer(t *testing.T) {
if url.SiteID != expected_site_id {
t.Errorf("Expected %d, got %d", expected_site_id, url.SiteID)
}
if !url.HasThumbnail {
t.Errorf("Should have a thumbnail, but it doesn't")
}
if url.IsContentDownloaded {
t.Errorf("Expected it not to be downloaded, but it was")
}
}
/**
 * A card with no preview image should still parse its text fields and be flagged as a card,
 * but must report that it has no thumbnail and carry no thumbnail URL.
 */
func TestParseAPIUrlCardWithoutThumbnail(t *testing.T) {
	data, err := ioutil.ReadFile("test_responses/url_card_without_thumbnail.json")
	if err != nil {
		// Fail only this test; a panic here would abort the whole test binary
		t.Fatalf("Couldn't read test fixture: %s", err.Error())
	}
	var apiCard scraper.APICard
	err = json.Unmarshal(data, &apiCard)
	if err != nil {
		t.Fatal(err.Error())
	}
	url := scraper.ParseAPIUrlCard(apiCard)

	expected_domain := "en.m.wikipedia.org"
	if url.Domain != expected_domain {
		t.Errorf("Expected %q, got %q", expected_domain, url.Domain)
	}
	expected_title := "Entryism - Wikipedia"
	if url.Title != expected_title {
		t.Errorf("Expected %q, got %q", expected_title, url.Title)
	}
	// The fixture has no description binding, so the parsed description is empty
	expected_description := ""
	if url.Description != expected_description {
		t.Errorf("Expected %q, got %q", expected_description, url.Description)
	}

	if !url.HasCard {
		t.Errorf("Expected it to have a card, but it didn't")
	}
	if url.HasThumbnail {
		t.Errorf("Should have no thumbnail, but it does")
	}
	if url.ThumbnailRemoteUrl != "" {
		t.Errorf("Expected no thumbnail URL, got %q", url.ThumbnailRemoteUrl)
	}
	if url.IsContentDownloaded {
		t.Errorf("Expected it not to be downloaded, but it was")
	}
}