Put videos, images, and video-thumbnails into subdirectories

This commit is contained in:
Alessio 2022-10-15 15:06:06 -04:00
parent 6de2d670e8
commit 069ddcd976
8 changed files with 54 additions and 23 deletions

View File

@ -44,12 +44,12 @@ test $(sqlite3 twitter.db "select count(*) from images") = "4"
test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 0") = "4" test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 0") = "4"
test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 1") = "0" test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 1") = "0"
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1503016316642689026") = "0" test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1503016316642689026") = "0"
test $(find images | wc -l) = "1" test $(find images -mindepth 2 | wc -l) = "0"
tw download_tweet_content https://twitter.com/wrathofgnon/status/1503016316642689026 tw download_tweet_content https://twitter.com/wrathofgnon/status/1503016316642689026
test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 0") = "0" test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 0") = "0"
test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 1") = "4" test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 1") = "4"
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1503016316642689026") = "1" test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1503016316642689026") = "1"
test $(find images | wc -l) = "5" test $(find images -mindepth 2 | wc -l) = "4"
# Try to double-download it # Try to double-download it
tw fetch_tweet_only https://twitter.com/wrathofgnon/status/1503016316642689026 tw fetch_tweet_only https://twitter.com/wrathofgnon/status/1503016316642689026
@ -68,14 +68,14 @@ test $(sqlite3 twitter.db "select count(*) from videos") = "1"
test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 0") = "1" test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 0") = "1"
test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 1") = "0" test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 1") = "0"
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1581025285524242432") = "0" test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1581025285524242432") = "0"
test $(find videos | wc -l) = "1" test $(find videos -mindepth 2 | wc -l) = "0"
test $(find video_thumbnails | wc -l) = "1" test $(find video_thumbnails -mindepth 2| wc -l) = "0"
tw download_tweet_content 1581025285524242432 tw download_tweet_content 1581025285524242432
test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 0") = "0" test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 0") = "0"
test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 1") = "1" test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 1") = "1"
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1581025285524242432") = "1" test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1581025285524242432") = "1"
test $(find videos | wc -l) = "2" test $(find videos -mindepth 2 | wc -l) = "1"
test $(find video_thumbnails | wc -l) = "2" test $(find video_thumbnails -mindepth 2 | wc -l) = "1"
# Try to double-download it # Try to double-download it
tw fetch_tweet_only https://twitter.com/SpaceX/status/1581025285524242432 tw fetch_tweet_only https://twitter.com/SpaceX/status/1581025285524242432
@ -85,7 +85,7 @@ test $(sqlite3 twitter.db "select count(*) from videos") = "1"
# Fetch a tweet with a GIF # Fetch a tweet with a GIF
tw fetch_user Cernovich tw fetch_user Cernovich
initial_videos_count=$(find videos | wc -l) initial_videos_count=$(find videos -mindepth 2 | wc -l) # Don't count prefix dirs
initial_videos_db_count=$(sqlite3 twitter.db "select count(*) from videos") initial_videos_db_count=$(sqlite3 twitter.db "select count(*) from videos")
tw fetch_tweet_only https://twitter.com/Cernovich/status/1444429517020274693 tw fetch_tweet_only https://twitter.com/Cernovich/status/1444429517020274693
@ -93,9 +93,9 @@ test $(sqlite3 twitter.db "select count(*) from videos") = "$((initial_videos_db
test $(sqlite3 twitter.db "select is_gif from videos where tweet_id = 1444429517020274693") = "1" test $(sqlite3 twitter.db "select is_gif from videos where tweet_id = 1444429517020274693") = "1"
# Download the GIF # Download the GIF
test $(find videos | wc -l) = "$((initial_videos_count))" # Shouldn't have changed yet test $(find videos -mindepth 2 | wc -l) = "$((initial_videos_count))" # Shouldn't have changed yet
tw download_tweet_content https://twitter.com/Cernovich/status/1444429517020274693 tw download_tweet_content https://twitter.com/Cernovich/status/1444429517020274693
test $(find videos | wc -l) = "$((initial_videos_count + 1))" test $(find videos -mindepth 2 | wc -l) = "$((initial_videos_count + 1))"
# Fetch a tweet with a poll # Fetch a tweet with a poll
@ -189,12 +189,12 @@ test $urls_count_after_2x = $urls_count_after
# Download the link's preview image # Download the link's preview image
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1024074310082748416") = "0" test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1024074310082748416") = "0"
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1024074310082748416") = "0" test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1024074310082748416") = "0"
initial_link_preview_images_count=$(find link_preview_images | wc -l) initial_link_preview_images_count=$(find link_preview_images -mindepth 2 | wc -l)
tw download_tweet_content 1024074310082748416 tw download_tweet_content 1024074310082748416
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1024074310082748416") = "1" test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1024074310082748416") = "1"
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1024074310082748416") = "1" test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1024074310082748416") = "1"
test $(find link_preview_images | wc -l) = "$((initial_link_preview_images_count + 1))" test $(find link_preview_images -mindepth 2 | wc -l) = "$((initial_link_preview_images_count + 1))"
test -f link_preview_images/${thumbnail_name}_400x400.jpg find link_preview_images | grep ${thumbnail_name}_400x400.jpg
# Test a tweet with a URL but no thumbnail # Test a tweet with a URL but no thumbnail

View File

@ -39,6 +39,16 @@ func (d DefaultDownloader) Curl(url string, outpath string) error {
return fmt.Errorf("Error downloading image %s:\n %w", url, err) return fmt.Errorf("Error downloading image %s:\n %w", url, err)
} }
// Ensure the output directory exists
dirname := path.Dir(outpath)
if dirname != "." {
err = os.Mkdir(dirname, 0755)
if err != nil {
panic(err)
}
}
// Write the downloaded data
err = os.WriteFile(outpath, data, 0644) err = os.WriteFile(outpath, data, 0644)
if err != nil { if err != nil {
return fmt.Errorf("Error writing to path %s, url %s:\n %w", outpath, url, err) return fmt.Errorf("Error writing to path %s, url %s:\n %w", outpath, url, err)

View File

@ -17,7 +17,8 @@ type Image struct {
} }
func ParseAPIMedia(apiMedia APIMedia) Image { func ParseAPIMedia(apiMedia APIMedia) Image {
local_filename := path.Base(apiMedia.MediaURLHttps) local_filename := get_prefixed_path(path.Base(apiMedia.MediaURLHttps))
return Image{ return Image{
ID: ImageID(apiMedia.ID), ID: ImageID(apiMedia.ID),
RemoteURL: apiMedia.MediaURLHttps, RemoteURL: apiMedia.MediaURLHttps,

View File

@ -26,6 +26,6 @@ func TestParseAPIMedia(t *testing.T) {
assert.Equal("https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg", image.RemoteURL) assert.Equal("https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg", image.RemoteURL)
assert.Equal(593, image.Width) assert.Equal(593, image.Width)
assert.Equal(239, image.Height) assert.Equal(239, image.Height)
assert.Equal("E18sEUrWYAk8dBl.jpg", image.LocalFilename) assert.Equal("E1/E18sEUrWYAk8dBl.jpg", image.LocalFilename)
assert.False(image.IsDownloaded) assert.False(image.IsDownloaded)
} }

View File

@ -60,6 +60,15 @@ func ParseAPIUrlCard(apiCard APICard) Url {
return ret return ret
} }
// prefixed_path_regex matches the first two characters of a filename when both
// are word characters (ASCII letters, digits, underscore) or hyphens.
// Compiled once at package scope rather than on every call.
var prefixed_path_regex = regexp.MustCompile(`^[\w-]{2}`)

// get_prefixed_path returns `p` placed inside a subdirectory named after its
// first two characters, e.g. "E18sEUrWYAk8dBl.jpg" -> "E1/E18sEUrWYAk8dBl.jpg".
//
// It panics if `p` does not begin with two characters from the set matched by
// `[\w-]` (for example, if it is shorter than two characters or begins with a dot).
func get_prefixed_path(p string) string {
	local_prefix := prefixed_path_regex.FindString(p)
	if len(local_prefix) != 2 {
		// FindString returns "" when there is no match, so any length other than 2 means failure
		panic(fmt.Sprintf("Unable to extract a 2-letter prefix for filename %s", p))
	}
	return path.Join(local_prefix, p)
}
func get_thumbnail_local_path(remote_url string) string { func get_thumbnail_local_path(remote_url string) string {
u, err := url.Parse(remote_url) u, err := url.Parse(remote_url)
if err != nil { if err != nil {
@ -73,7 +82,9 @@ func get_thumbnail_local_path(remote_url string) string {
panic(err) panic(err)
} }
return fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0]) return get_prefixed_path(
fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0]),
)
} }
/** /**

View File

@ -29,7 +29,7 @@ func TestParseAPIUrlCard(t *testing.T) {
assert.Equal(600, url.ThumbnailWidth) assert.Equal(600, url.ThumbnailWidth)
assert.Equal(315, url.ThumbnailHeight) assert.Equal(315, url.ThumbnailHeight)
assert.Equal("https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x600", url.ThumbnailRemoteUrl) assert.Equal("https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x600", url.ThumbnailRemoteUrl)
assert.Equal("odDi9EqO_600x600.jpg", url.ThumbnailLocalPath) assert.Equal("od/odDi9EqO_600x600.jpg", url.ThumbnailLocalPath)
assert.Equal(UserID(155581583), url.CreatorID) assert.Equal(UserID(155581583), url.CreatorID)
assert.Equal(UserID(16467567), url.SiteID) assert.Equal(UserID(16467567), url.SiteID)
assert.True(url.HasThumbnail) assert.True(url.HasThumbnail)
@ -52,7 +52,7 @@ func TestParseAPIUrlCardWithPlayer(t *testing.T) {
assert.Equal("Watch this episode on LBRY/Odysee: https://odysee.com/@capitalresearch:5/the-politically-incorrect-guide-to-the:8"+ assert.Equal("Watch this episode on LBRY/Odysee: https://odysee.com/@capitalresearch:5/the-politically-incorrect-guide-to-the:8"+
"Watch this episode on Rumble: https://rumble...", url.Description) "Watch this episode on Rumble: https://rumble...", url.Description)
assert.Equal("https://pbs.twimg.com/card_img/1437849456423194639/_1t0btyt?format=jpg&name=800x320_1", url.ThumbnailRemoteUrl) assert.Equal("https://pbs.twimg.com/card_img/1437849456423194639/_1t0btyt?format=jpg&name=800x320_1", url.ThumbnailRemoteUrl)
assert.Equal("_1t0btyt_800x320_1.jpg", url.ThumbnailLocalPath) assert.Equal("_1/_1t0btyt_800x320_1.jpg", url.ThumbnailLocalPath)
assert.Equal(UserID(10228272), url.SiteID) assert.Equal(UserID(10228272), url.SiteID)
assert.True(url.HasThumbnail) assert.True(url.HasThumbnail)
assert.False(url.IsContentDownloaded) assert.False(url.IsContentDownloaded)

View File

@ -1,7 +1,7 @@
package scraper package scraper
import ( import (
"fmt" "net/url"
"path" "path"
"sort" "sort"
) )
@ -28,9 +28,18 @@ type Video struct {
IsGif bool IsGif bool
} }
// get_filename parses `remote_url` and returns the last element of its path
// component (the query string and fragment are not part of the result).
//
// It panics if `remote_url` cannot be parsed as a URL.
func get_filename(remote_url string) string {
	parsed, parse_err := url.Parse(remote_url)
	if parse_err == nil {
		return path.Base(parsed.Path)
	}
	panic(parse_err)
}
func ParseAPIVideo(apiVideo APIExtendedMedia, tweet_id TweetID) Video { func ParseAPIVideo(apiVideo APIExtendedMedia, tweet_id TweetID) Video {
variants := apiVideo.VideoInfo.Variants variants := apiVideo.VideoInfo.Variants
sort.Sort(variants) sort.Sort(variants)
video_remote_url := variants[0].URL
var view_count int var view_count int
@ -51,18 +60,18 @@ func ParseAPIVideo(apiVideo APIExtendedMedia, tweet_id TweetID) Video {
} }
} }
local_filename := fmt.Sprintf("%d.mp4", tweet_id) local_filename := get_prefixed_path(get_filename(video_remote_url))
return Video{ return Video{
ID: VideoID(apiVideo.ID), ID: VideoID(apiVideo.ID),
TweetID: tweet_id, TweetID: tweet_id,
Width: apiVideo.OriginalInfo.Width, Width: apiVideo.OriginalInfo.Width,
Height: apiVideo.OriginalInfo.Height, Height: apiVideo.OriginalInfo.Height,
RemoteURL: variants[0].URL, RemoteURL: video_remote_url,
LocalFilename: local_filename, LocalFilename: local_filename,
ThumbnailRemoteUrl: apiVideo.MediaURLHttps, ThumbnailRemoteUrl: apiVideo.MediaURLHttps,
ThumbnailLocalPath: path.Base(apiVideo.MediaURLHttps), ThumbnailLocalPath: get_prefixed_path(path.Base(apiVideo.MediaURLHttps)),
Duration: apiVideo.VideoInfo.Duration, Duration: apiVideo.VideoInfo.Duration,
ViewCount: view_count, ViewCount: view_count,

View File

@ -28,9 +28,9 @@ func TestParseAPIVideo(t *testing.T) {
assert.Equal(1280, video.Height) assert.Equal(1280, video.Height)
assert.Equal(720, video.Width) assert.Equal(720, video.Width)
assert.Equal("https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/720x1280/sm4iL9_f8Lclh0aa.mp4?tag=12", video.RemoteURL) assert.Equal("https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/720x1280/sm4iL9_f8Lclh0aa.mp4?tag=12", video.RemoteURL)
assert.Equal("28.mp4", video.LocalFilename) assert.Equal("sm/sm4iL9_f8Lclh0aa.mp4", video.LocalFilename)
assert.Equal("https://pbs.twimg.com/ext_tw_video_thumb/1418951950020845568/pu/img/eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailRemoteUrl) assert.Equal("https://pbs.twimg.com/ext_tw_video_thumb/1418951950020845568/pu/img/eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailRemoteUrl)
assert.Equal("eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailLocalPath) assert.Equal("eU/eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailLocalPath)
assert.Equal(275952, video.ViewCount) assert.Equal(275952, video.ViewCount)
assert.Equal(88300, video.Duration) assert.Equal(88300, video.Duration)
assert.False(video.IsDownloaded) assert.False(video.IsDownloaded)