diff --git a/cmd/tests.sh b/cmd/tests.sh
index 1ba26a4..0aeabf3 100755
--- a/cmd/tests.sh
+++ b/cmd/tests.sh
@@ -130,14 +130,25 @@ test $urls_count_after_2x = $urls_count_after
 
 # Download the link's preview image
 test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1428904664645394433") = "0"
 test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1428904664645394433") = "0"
-test $(find link_preview_images/* | wc -l) = "0"
+test $(find link_preview_images | wc -l) = "1"
 tw download_tweet_content 1428904664645394433
 test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1428904664645394433") = "1"
 test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1428904664645394433") = "1"
-test $(find link_preview_images/* | wc -l) = "1"
+test $(find link_preview_images | wc -l) = "2"
 test -f link_preview_images/WX1Rv2AJ_800x320_1.jpg
 
+# Test a tweet with a URL but no thumbnail
+tw fetch_tweet https://twitter.com/Xirong7/status/1413665734866186243
+test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1413665734866186243") = "0"
+test $(sqlite3 twitter.db "select has_thumbnail from urls where tweet_id = 1413665734866186243") = "0"
+test $(find link_preview_images | wc -l) = "2"
+tw download_tweet_content 1413665734866186243
+test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1413665734866186243") = "1"
+test $(find link_preview_images | wc -l) = "2"
+
+
+
 # TODO: Maybe this file should be broken up into multiple test scripts
 
 echo -e "\033[32mAll tests passed. Finished successfully.\033[0m"
diff --git a/persistence/media_download.go b/persistence/media_download.go
index fbd7380..9eab937 100644
--- a/persistence/media_download.go
+++ b/persistence/media_download.go
@@ -76,7 +76,7 @@ func (p Profile) download_tweet_video(v *scraper.Video, downloader MediaDownload
  * Downloads an URL thumbnail image, and if successful, marks it as downloaded in the DB
  */
 func (p Profile) download_link_thumbnail(url *scraper.Url, downloader MediaDownloader) error {
-	if url.HasCard {
+	if url.HasCard && url.HasThumbnail {
 		outfile := path.Join(p.ProfileDir, "link_preview_images", url.ThumbnailLocalPath)
 		err := downloader.Curl(url.ThumbnailRemoteUrl, outfile)
 		if err != nil {
diff --git a/persistence/media_queries.go b/persistence/media_queries.go
index cfd6c6a..8709417 100644
--- a/persistence/media_queries.go
+++ b/persistence/media_queries.go
@@ -47,12 +47,12 @@ func (p Profile) SaveVideo(vid scraper.Video) error {
  */
 func (p Profile) SaveUrl(url scraper.Url) error {
 	_, err := p.DB.Exec(`
-		insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded)
-		            values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+		insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, has_thumbnail, is_content_downloaded)
+		            values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
 		       on conflict do update
 		       set is_content_downloaded=?
 		`,
-		url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.IsContentDownloaded,
+		url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.HasThumbnail, url.IsContentDownloaded,
 		url.IsContentDownloaded,
 	)
 	return err
@@ -114,7 +114,7 @@ func (p Profile) GetVideosForTweet(t scraper.Tweet) (vids []scraper.Video, err e
  * Get the list of Urls for a Tweet
  */
 func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error) {
-	stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded from urls where tweet_id=? order by rowid")
+	stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, has_thumbnail, is_content_downloaded from urls where tweet_id=? order by rowid")
 	if err != nil {
 		return
 	}
@@ -125,7 +125,7 @@ func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error
 	}
 	var url scraper.Url
 	for rows.Next() {
-		err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.IsContentDownloaded)
+		err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.HasThumbnail, &url.IsContentDownloaded)
 		if err != nil {
 			return
 		}
diff --git a/persistence/schema.sql b/persistence/schema.sql
index 5a59e19..362a89f 100644
--- a/persistence/schema.sql
+++ b/persistence/schema.sql
@@ -63,6 +63,7 @@ create table urls (rowid integer primary key,
 	thumbnail_remote_url text,
 	thumbnail_local_path text,
 	has_card boolean,
+	has_thumbnail boolean,
 	is_content_downloaded boolean default 0,
 
 	unique (tweet_id, text)
diff --git a/scraper/test_responses/url_card_without_thumbnail.json b/scraper/test_responses/url_card_without_thumbnail.json
new file mode 100644
index 0000000..abb83fc
--- /dev/null
+++ b/scraper/test_responses/url_card_without_thumbnail.json
@@ -0,0 +1 @@
+{"name":"summary","url":"https://t.co/BfcswDBBtl","card_type_url":"http://card-type-url-is-deprecated.invalid","binding_values":{"vanity_url":{"type":"STRING","string_value":"en.m.wikipedia.org","scribe_key":"vanity_url"},"domain":{"type":"STRING","string_value":"en.m.wikipedia.org"},"title":{"type":"STRING","string_value":"Entryism - Wikipedia"},"card_url":{"type":"STRING","string_value":"https://t.co/BfcswDBBtl","scribe_key":"card_url"}},"card_platform":{"platform":{"device":{"name":"Swift","version":"12"},"audience":{"name":"production"}}}}
diff --git a/scraper/url.go b/scraper/url.go
index 3c5f9e6..733cce3 100644
--- a/scraper/url.go
+++ b/scraper/url.go
@@ -19,38 +19,39 @@ type Url struct {
 	SiteID              UserID
 
 	HasCard             bool
+	HasThumbnail        bool
 	IsContentDownloaded bool
 }
 
 func ParseAPIUrlCard(apiCard APICard) Url {
 	values := apiCard.BindingValues
+	ret := Url{}
+	ret.HasCard = true
+
+	ret.Domain = values.Domain.Value
+	ret.Title = values.Title.Value
+	ret.Description = values.Description.Value
+	ret.IsContentDownloaded = false
+	ret.CreatorID = UserID(values.Creator.UserValue.Value)
+	ret.SiteID = UserID(values.Site.UserValue.Value)
+
+	var thumbnail_url string
+
 	if apiCard.Name == "summary_large_image" || apiCard.Name == "summary" {
-		return Url{
-			Domain:              values.Domain.Value,
-			Title:               values.Title.Value,
-			Description:         values.Description.Value,
-			ThumbnailRemoteUrl:  values.Thumbnail.ImageValue.Url,
-			ThumbnailLocalPath:  get_thumbnail_local_path(values.Thumbnail.ImageValue.Url),
-			CreatorID:           UserID(values.Creator.UserValue.Value),
-			SiteID:              UserID(values.Site.UserValue.Value),
-			HasCard:             true,
-			IsContentDownloaded: false,
-		}
+		thumbnail_url = values.Thumbnail.ImageValue.Url
 	} else if apiCard.Name == "player" {
-		return Url{
-			Domain:              values.Domain.Value,
-			Title:               values.Title.Value,
-			Description:         values.Description.Value,
-			ThumbnailRemoteUrl:  values.PlayerImage.ImageValue.Url,
-			ThumbnailLocalPath:  get_thumbnail_local_path(values.PlayerImage.ImageValue.Url),
-			CreatorID:           UserID(values.Creator.UserValue.Value),
-			SiteID:              UserID(values.Site.UserValue.Value),
-			HasCard:             true,
-			IsContentDownloaded: false,
-		}
+		thumbnail_url = values.PlayerImage.ImageValue.Url
 	} else {
 		panic("Unknown card type: " + apiCard.Name)
 	}
+
+	if thumbnail_url != "" {
+		ret.HasThumbnail = true
+		ret.ThumbnailRemoteUrl = thumbnail_url
+		ret.ThumbnailLocalPath = get_thumbnail_local_path(thumbnail_url)
+	}
+
+	return ret
 }
 
 func get_thumbnail_local_path(remote_url string) string {
diff --git a/scraper/url_test.go b/scraper/url_test.go
index 630ef73..2ae7caa 100644
--- a/scraper/url_test.go
+++ b/scraper/url_test.go
@@ -48,6 +48,9 @@ func TestParseAPIUrlCard(t *testing.T) {
 	if url.SiteID != expected_site_id {
 		t.Errorf("Expected %d, got %d", expected_site_id, url.SiteID)
 	}
+	if !url.HasThumbnail {
+		t.Errorf("Should have a thumbnail, but it doesn't")
+	}
 	if url.IsContentDownloaded {
 		t.Errorf("Expected it not to be downloaded, but it was")
 	}
@@ -89,7 +92,43 @@ func TestParseAPIUrlCardWithPlayer(t *testing.T) {
 	if url.SiteID != expected_site_id {
 		t.Errorf("Expected %d, got %d", expected_site_id, url.SiteID)
 	}
+	if !url.HasThumbnail {
+		t.Errorf("Should have a thumbnail, but it doesn't")
+	}
 	if url.IsContentDownloaded {
 		t.Errorf("Expected it not to be downloaded, but it was")
 	}
 }
+
+func TestParseAPIUrlCardWithoutThumbnail(t *testing.T) {
+	data, err := ioutil.ReadFile("test_responses/url_card_without_thumbnail.json")
+	if err != nil {
+		panic(err)
+	}
+	var apiCard scraper.APICard
+	err = json.Unmarshal(data, &apiCard)
+	if err != nil {
+		t.Fatal(err.Error())
+	}
+	url := scraper.ParseAPIUrlCard(apiCard)
+
+	expected_domain := "en.m.wikipedia.org"
+	if url.Domain != expected_domain {
+		t.Errorf("Expected %q, got %q", expected_domain, url.Domain)
+	}
+	expected_title := "Entryism - Wikipedia"
+	if url.Title != expected_title {
+		t.Errorf("Expected %q, got %q", expected_title, url.Title)
+	}
+	expected_description := ""
+	if url.Description != expected_description {
+		t.Errorf("Expected %q, got %q", expected_description, url.Description)
+	}
+
+	if !url.HasCard {
+		t.Errorf("Expected it to have a card, but it didn't")
+	}
+	if url.HasThumbnail {
+		t.Errorf("Should have no thumbnail, but it does")
+	}
+}