diff --git a/cmd/tests.sh b/cmd/tests.sh index 2cdcbd8..f3920b0 100755 --- a/cmd/tests.sh +++ b/cmd/tests.sh @@ -44,12 +44,12 @@ test $(sqlite3 twitter.db "select count(*) from images") = "4" test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 0") = "4" test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 1") = "0" test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1503016316642689026") = "0" -test $(find images | wc -l) = "1" +test $(find images -mindepth 2 | wc -l) = "0" tw download_tweet_content https://twitter.com/wrathofgnon/status/1503016316642689026 test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 0") = "0" test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 1") = "4" test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1503016316642689026") = "1" -test $(find images | wc -l) = "5" +test $(find images -mindepth 2 | wc -l) = "4" # Try to double-download it tw fetch_tweet_only https://twitter.com/wrathofgnon/status/1503016316642689026 @@ -68,14 +68,14 @@ test $(sqlite3 twitter.db "select count(*) from videos") = "1" test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 0") = "1" test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 1") = "0" test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1581025285524242432") = "0" -test $(find videos | wc -l) = "1" -test $(find video_thumbnails | wc -l) = "1" +test $(find videos -mindepth 2 | wc -l) = "0" +test $(find video_thumbnails -mindepth 2| wc -l) = "0" tw download_tweet_content 1581025285524242432 test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 
1581025285524242432 and is_downloaded = 0") = "0" test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 1") = "1" test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1581025285524242432") = "1" -test $(find videos | wc -l) = "2" -test $(find video_thumbnails | wc -l) = "2" +test $(find videos -mindepth 2 | wc -l) = "1" +test $(find video_thumbnails -mindepth 2 | wc -l) = "1" # Try to double-download it tw fetch_tweet_only https://twitter.com/SpaceX/status/1581025285524242432 @@ -85,7 +85,7 @@ test $(sqlite3 twitter.db "select count(*) from videos") = "1" # Fetch a tweet with a GIF tw fetch_user Cernovich -initial_videos_count=$(find videos | wc -l) +initial_videos_count=$(find videos -mindepth 2 | wc -l) # Don't count prefix dirs initial_videos_db_count=$(sqlite3 twitter.db "select count(*) from videos") tw fetch_tweet_only https://twitter.com/Cernovich/status/1444429517020274693 @@ -93,9 +93,9 @@ test $(sqlite3 twitter.db "select count(*) from videos") = "$((initial_videos_db test $(sqlite3 twitter.db "select is_gif from videos where tweet_id = 1444429517020274693") = "1" # Download the GIF -test $(find videos | wc -l) = "$((initial_videos_count))" # Shouldn't have changed yet +test $(find videos -mindepth 2 | wc -l) = "$((initial_videos_count))" # Shouldn't have changed yet tw download_tweet_content https://twitter.com/Cernovich/status/1444429517020274693 -test $(find videos | wc -l) = "$((initial_videos_count + 1))" +test $(find videos -mindepth 2 | wc -l) = "$((initial_videos_count + 1))" # Fetch a tweet with a poll @@ -189,12 +189,12 @@ test $urls_count_after_2x = $urls_count_after # Download the link's preview image test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1024074310082748416") = "0" test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1024074310082748416") = "0" -initial_link_preview_images_count=$(find 
link_preview_images | wc -l) +initial_link_preview_images_count=$(find link_preview_images -mindepth 2 | wc -l) tw download_tweet_content 1024074310082748416 test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1024074310082748416") = "1" test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1024074310082748416") = "1" -test $(find link_preview_images | wc -l) = "$((initial_link_preview_images_count + 1))" -test -f link_preview_images/${thumbnail_name}_400x400.jpg +test $(find link_preview_images -mindepth 2 | wc -l) = "$((initial_link_preview_images_count + 1))" +find link_preview_images | grep ${thumbnail_name}_400x400.jpg # Test a tweet with a URL but no thumbnail diff --git a/persistence/media_download.go b/persistence/media_download.go index ddd7406..a96ad67 100644 --- a/persistence/media_download.go +++ b/persistence/media_download.go @@ -39,6 +39,16 @@ func (d DefaultDownloader) Curl(url string, outpath string) error { return fmt.Errorf("Error downloading image %s:\n %w", url, err) } + // Ensure the output directory exists + dirname := path.Dir(outpath) + if dirname != "." 
// local_prefix_regex matches the first two characters of a filename when both
// are ASCII word characters (letters, digits, underscore) or hyphens.  It is
// compiled once at package initialization rather than on every call.
var local_prefix_regex = regexp.MustCompile(`^[\w-]{2}`)

// get_prefixed_path places the filename `p` under a directory named after the
// first two characters of `p`, joined with a forward slash.  For example,
// "E18sEUrWYAk8dBl.jpg" becomes "E1/E18sEUrWYAk8dBl.jpg".
//
// Panics if `p` does not begin with two characters from the set [A-Za-z0-9_-],
// since no prefix directory can be derived in that case.
func get_prefixed_path(p string) string {
	// The pattern is anchored and fixed-width, so the match is either exactly
	// two characters long or the empty string (no match).
	local_prefix := local_prefix_regex.FindString(p)
	if len(local_prefix) != 2 {
		panic(fmt.Sprintf("Unable to extract a 2-letter prefix for filename %s", p))
	}
	return path.Join(local_prefix, p)
}
// get_filename returns the final element of the path component of
// `remote_url`.  Only the parsed URL's path is consulted, so a query string
// such as "?tag=12" is not part of the result.
//
// Panics if `remote_url` cannot be parsed as a URL.
func get_filename(remote_url string) string {
	parsed_url, parse_err := url.Parse(remote_url)
	if parse_err != nil {
		panic(parse_err)
	}
	url_path := parsed_url.Path
	return path.Base(url_path)
}
func ParseAPIVideo(apiVideo APIExtendedMedia, tweet_id TweetID) Video { } } - local_filename := fmt.Sprintf("%d.mp4", tweet_id) + local_filename := get_prefixed_path(get_filename(video_remote_url)) return Video{ ID: VideoID(apiVideo.ID), TweetID: tweet_id, Width: apiVideo.OriginalInfo.Width, Height: apiVideo.OriginalInfo.Height, - RemoteURL: variants[0].URL, + RemoteURL: video_remote_url, LocalFilename: local_filename, ThumbnailRemoteUrl: apiVideo.MediaURLHttps, - ThumbnailLocalPath: path.Base(apiVideo.MediaURLHttps), + ThumbnailLocalPath: get_prefixed_path(path.Base(apiVideo.MediaURLHttps)), Duration: apiVideo.VideoInfo.Duration, ViewCount: view_count, diff --git a/scraper/video_test.go b/scraper/video_test.go index 5ef2ea4..856683d 100644 --- a/scraper/video_test.go +++ b/scraper/video_test.go @@ -28,9 +28,9 @@ func TestParseAPIVideo(t *testing.T) { assert.Equal(1280, video.Height) assert.Equal(720, video.Width) assert.Equal("https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/720x1280/sm4iL9_f8Lclh0aa.mp4?tag=12", video.RemoteURL) - assert.Equal("28.mp4", video.LocalFilename) + assert.Equal("sm/sm4iL9_f8Lclh0aa.mp4", video.LocalFilename) assert.Equal("https://pbs.twimg.com/ext_tw_video_thumb/1418951950020845568/pu/img/eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailRemoteUrl) - assert.Equal("eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailLocalPath) + assert.Equal("eU/eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailLocalPath) assert.Equal(275952, video.ViewCount) assert.Equal(88300, video.Duration) assert.False(video.IsDownloaded)