Put videos, images, and video-thumbnails into subdirectories

This commit is contained in:
Alessio 2022-10-15 15:06:06 -04:00
parent 6de2d670e8
commit 069ddcd976
8 changed files with 54 additions and 23 deletions

View File

@ -44,12 +44,12 @@ test $(sqlite3 twitter.db "select count(*) from images") = "4"
test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 0") = "4"
test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 1") = "0"
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1503016316642689026") = "0"
test $(find images | wc -l) = "1"
test $(find images -mindepth 2 | wc -l) = "0"
tw download_tweet_content https://twitter.com/wrathofgnon/status/1503016316642689026
test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 0") = "0"
test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 1") = "4"
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1503016316642689026") = "1"
test $(find images | wc -l) = "5"
test $(find images -mindepth 2 | wc -l) = "4"
# Try to double-download it
tw fetch_tweet_only https://twitter.com/wrathofgnon/status/1503016316642689026
@ -68,14 +68,14 @@ test $(sqlite3 twitter.db "select count(*) from videos") = "1"
test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 0") = "1"
test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 1") = "0"
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1581025285524242432") = "0"
test $(find videos | wc -l) = "1"
test $(find video_thumbnails | wc -l) = "1"
test $(find videos -mindepth 2 | wc -l) = "0"
test $(find video_thumbnails -mindepth 2| wc -l) = "0"
tw download_tweet_content 1581025285524242432
test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 0") = "0"
test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 1") = "1"
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1581025285524242432") = "1"
test $(find videos | wc -l) = "2"
test $(find video_thumbnails | wc -l) = "2"
test $(find videos -mindepth 2 | wc -l) = "1"
test $(find video_thumbnails -mindepth 2 | wc -l) = "1"
# Try to double-download it
tw fetch_tweet_only https://twitter.com/SpaceX/status/1581025285524242432
@ -85,7 +85,7 @@ test $(sqlite3 twitter.db "select count(*) from videos") = "1"
# Fetch a tweet with a GIF
tw fetch_user Cernovich
initial_videos_count=$(find videos | wc -l)
initial_videos_count=$(find videos -mindepth 2 | wc -l) # Don't count prefix dirs
initial_videos_db_count=$(sqlite3 twitter.db "select count(*) from videos")
tw fetch_tweet_only https://twitter.com/Cernovich/status/1444429517020274693
@ -93,9 +93,9 @@ test $(sqlite3 twitter.db "select count(*) from videos") = "$((initial_videos_db
test $(sqlite3 twitter.db "select is_gif from videos where tweet_id = 1444429517020274693") = "1"
# Download the GIF
test $(find videos | wc -l) = "$((initial_videos_count))" # Shouldn't have changed yet
test $(find videos -mindepth 2 | wc -l) = "$((initial_videos_count))" # Shouldn't have changed yet
tw download_tweet_content https://twitter.com/Cernovich/status/1444429517020274693
test $(find videos | wc -l) = "$((initial_videos_count + 1))"
test $(find videos -mindepth 2 | wc -l) = "$((initial_videos_count + 1))"
# Fetch a tweet with a poll
@ -189,12 +189,12 @@ test $urls_count_after_2x = $urls_count_after
# Download the link's preview image
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1024074310082748416") = "0"
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1024074310082748416") = "0"
initial_link_preview_images_count=$(find link_preview_images | wc -l)
initial_link_preview_images_count=$(find link_preview_images -mindepth 2 | wc -l)
tw download_tweet_content 1024074310082748416
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1024074310082748416") = "1"
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1024074310082748416") = "1"
test $(find link_preview_images | wc -l) = "$((initial_link_preview_images_count + 1))"
test -f link_preview_images/${thumbnail_name}_400x400.jpg
test $(find link_preview_images -mindepth 2 | wc -l) = "$((initial_link_preview_images_count + 1))"
find link_preview_images | grep ${thumbnail_name}_400x400.jpg
# Test a tweet with a URL but no thumbnail

View File

@ -39,6 +39,16 @@ func (d DefaultDownloader) Curl(url string, outpath string) error {
return fmt.Errorf("Error downloading image %s:\n %w", url, err)
}
// Ensure the output directory exists.  `os.MkdirAll` is used instead of `os.Mkdir` because
// many files share the same prefix directory: `os.Mkdir` reports an error when the directory
// is already there, which would abort every download after the first one for a given prefix.
// `os.MkdirAll` returns nil for an existing directory and also creates any missing parents.
dirname := path.Dir(outpath)
if dirname != "." {
	err = os.MkdirAll(dirname, 0755)
	if err != nil {
		// Report the failure to the caller like the other failure paths in this function
		return fmt.Errorf("Error creating directory %s for url %s:\n %w", dirname, url, err)
	}
}
// Write the downloaded data
err = os.WriteFile(outpath, data, 0644)
if err != nil {
return fmt.Errorf("Error writing to path %s, url %s:\n %w", outpath, url, err)

View File

@ -17,7 +17,8 @@ type Image struct {
}
func ParseAPIMedia(apiMedia APIMedia) Image {
local_filename := path.Base(apiMedia.MediaURLHttps)
local_filename := get_prefixed_path(path.Base(apiMedia.MediaURLHttps))
return Image{
ID: ImageID(apiMedia.ID),
RemoteURL: apiMedia.MediaURLHttps,

View File

@ -26,6 +26,6 @@ func TestParseAPIMedia(t *testing.T) {
assert.Equal("https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg", image.RemoteURL)
assert.Equal(593, image.Width)
assert.Equal(239, image.Height)
assert.Equal("E18sEUrWYAk8dBl.jpg", image.LocalFilename)
assert.Equal("E1/E18sEUrWYAk8dBl.jpg", image.LocalFilename)
assert.False(image.IsDownloaded)
}

View File

@ -60,6 +60,15 @@ func ParseAPIUrlCard(apiCard APICard) Url {
return ret
}
// prefixed_path_regex matches the first two characters of a filename when both of them are
// ASCII word characters (letters, digits, underscore) or hyphens.  It is compiled once at
// package scope so that `get_prefixed_path` does not recompile it on every call.
var prefixed_path_regex = regexp.MustCompile(`^[\w-]{2}`)

// get_prefixed_path places the filename `p` inside a subdirectory named after its first two
// characters, e.g. "E18sEUrWYAk8dBl.jpg" becomes "E1/E18sEUrWYAk8dBl.jpg".
//
// Panics if `p` does not begin with two characters from the set `[0-9A-Za-z_-]` (which
// includes filenames shorter than two characters).
func get_prefixed_path(p string) string {
	local_prefix := prefixed_path_regex.FindString(p)
	if len(local_prefix) != 2 {
		// FindString returns "" when there is no match
		panic(fmt.Sprintf("Unable to extract a 2-letter prefix for filename %s", p))
	}
	return path.Join(local_prefix, p)
}
func get_thumbnail_local_path(remote_url string) string {
u, err := url.Parse(remote_url)
if err != nil {
@ -73,7 +82,9 @@ func get_thumbnail_local_path(remote_url string) string {
panic(err)
}
return fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0])
return get_prefixed_path(
fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0]),
)
}
/**

View File

@ -29,7 +29,7 @@ func TestParseAPIUrlCard(t *testing.T) {
assert.Equal(600, url.ThumbnailWidth)
assert.Equal(315, url.ThumbnailHeight)
assert.Equal("https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x600", url.ThumbnailRemoteUrl)
assert.Equal("odDi9EqO_600x600.jpg", url.ThumbnailLocalPath)
assert.Equal("od/odDi9EqO_600x600.jpg", url.ThumbnailLocalPath)
assert.Equal(UserID(155581583), url.CreatorID)
assert.Equal(UserID(16467567), url.SiteID)
assert.True(url.HasThumbnail)
@ -52,7 +52,7 @@ func TestParseAPIUrlCardWithPlayer(t *testing.T) {
assert.Equal("Watch this episode on LBRY/Odysee: https://odysee.com/@capitalresearch:5/the-politically-incorrect-guide-to-the:8"+
"Watch this episode on Rumble: https://rumble...", url.Description)
assert.Equal("https://pbs.twimg.com/card_img/1437849456423194639/_1t0btyt?format=jpg&name=800x320_1", url.ThumbnailRemoteUrl)
assert.Equal("_1t0btyt_800x320_1.jpg", url.ThumbnailLocalPath)
assert.Equal("_1/_1t0btyt_800x320_1.jpg", url.ThumbnailLocalPath)
assert.Equal(UserID(10228272), url.SiteID)
assert.True(url.HasThumbnail)
assert.False(url.IsContentDownloaded)

View File

@ -1,7 +1,7 @@
package scraper
import (
"fmt"
"net/url"
"path"
"sort"
)
@ -28,9 +28,18 @@ type Video struct {
IsGif bool
}
// get_filename returns the last element of the path component of `remote_url`, ignoring any
// query string (e.g. "https://host/a/b/clip.mp4?tag=12" yields "clip.mp4").
//
// Panics if `remote_url` cannot be parsed as a URL.
func get_filename(remote_url string) string {
	parsed, parse_err := url.Parse(remote_url)
	if parse_err == nil {
		return path.Base(parsed.Path)
	}
	panic(parse_err)
}
func ParseAPIVideo(apiVideo APIExtendedMedia, tweet_id TweetID) Video {
variants := apiVideo.VideoInfo.Variants
sort.Sort(variants)
video_remote_url := variants[0].URL
var view_count int
@ -51,18 +60,18 @@ func ParseAPIVideo(apiVideo APIExtendedMedia, tweet_id TweetID) Video {
}
}
local_filename := fmt.Sprintf("%d.mp4", tweet_id)
local_filename := get_prefixed_path(get_filename(video_remote_url))
return Video{
ID: VideoID(apiVideo.ID),
TweetID: tweet_id,
Width: apiVideo.OriginalInfo.Width,
Height: apiVideo.OriginalInfo.Height,
RemoteURL: variants[0].URL,
RemoteURL: video_remote_url,
LocalFilename: local_filename,
ThumbnailRemoteUrl: apiVideo.MediaURLHttps,
ThumbnailLocalPath: path.Base(apiVideo.MediaURLHttps),
ThumbnailLocalPath: get_prefixed_path(path.Base(apiVideo.MediaURLHttps)),
Duration: apiVideo.VideoInfo.Duration,
ViewCount: view_count,

View File

@ -28,9 +28,9 @@ func TestParseAPIVideo(t *testing.T) {
assert.Equal(1280, video.Height)
assert.Equal(720, video.Width)
assert.Equal("https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/720x1280/sm4iL9_f8Lclh0aa.mp4?tag=12", video.RemoteURL)
assert.Equal("28.mp4", video.LocalFilename)
assert.Equal("sm/sm4iL9_f8Lclh0aa.mp4", video.LocalFilename)
assert.Equal("https://pbs.twimg.com/ext_tw_video_thumb/1418951950020845568/pu/img/eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailRemoteUrl)
assert.Equal("eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailLocalPath)
assert.Equal("eU/eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailLocalPath)
assert.Equal(275952, video.ViewCount)
assert.Equal(88300, video.Duration)
assert.False(video.IsDownloaded)