Put videos, images, and video-thumbnails into subdirectories
This commit is contained in:
parent
6de2d670e8
commit
069ddcd976
24
cmd/tests.sh
24
cmd/tests.sh
@ -44,12 +44,12 @@ test $(sqlite3 twitter.db "select count(*) from images") = "4"
|
||||
test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 0") = "4"
|
||||
test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 1") = "0"
|
||||
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1503016316642689026") = "0"
|
||||
test $(find images | wc -l) = "1"
|
||||
test $(find images -mindepth 2 | wc -l) = "0"
|
||||
tw download_tweet_content https://twitter.com/wrathofgnon/status/1503016316642689026
|
||||
test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 0") = "0"
|
||||
test $(sqlite3 twitter.db "select count(*) from images where tweet_id = 1503016316642689026 and is_downloaded = 1") = "4"
|
||||
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1503016316642689026") = "1"
|
||||
test $(find images | wc -l) = "5"
|
||||
test $(find images -mindepth 2 | wc -l) = "4"
|
||||
|
||||
# Try to double-download it
|
||||
tw fetch_tweet_only https://twitter.com/wrathofgnon/status/1503016316642689026
|
||||
@ -68,14 +68,14 @@ test $(sqlite3 twitter.db "select count(*) from videos") = "1"
|
||||
test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 0") = "1"
|
||||
test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 1") = "0"
|
||||
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1581025285524242432") = "0"
|
||||
test $(find videos | wc -l) = "1"
|
||||
test $(find video_thumbnails | wc -l) = "1"
|
||||
test $(find videos -mindepth 2 | wc -l) = "0"
|
||||
test $(find video_thumbnails -mindepth 2| wc -l) = "0"
|
||||
tw download_tweet_content 1581025285524242432
|
||||
test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 0") = "0"
|
||||
test $(sqlite3 twitter.db "select count(*) from videos where tweet_id = 1581025285524242432 and is_downloaded = 1") = "1"
|
||||
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1581025285524242432") = "1"
|
||||
test $(find videos | wc -l) = "2"
|
||||
test $(find video_thumbnails | wc -l) = "2"
|
||||
test $(find videos -mindepth 2 | wc -l) = "1"
|
||||
test $(find video_thumbnails -mindepth 2 | wc -l) = "1"
|
||||
|
||||
# Try to double-download it
|
||||
tw fetch_tweet_only https://twitter.com/SpaceX/status/1581025285524242432
|
||||
@ -85,7 +85,7 @@ test $(sqlite3 twitter.db "select count(*) from videos") = "1"
|
||||
|
||||
# Fetch a tweet with a GIF
|
||||
tw fetch_user Cernovich
|
||||
initial_videos_count=$(find videos | wc -l)
|
||||
initial_videos_count=$(find videos -mindepth 2 | wc -l) # Don't count prefix dirs
|
||||
initial_videos_db_count=$(sqlite3 twitter.db "select count(*) from videos")
|
||||
tw fetch_tweet_only https://twitter.com/Cernovich/status/1444429517020274693
|
||||
|
||||
@ -93,9 +93,9 @@ test $(sqlite3 twitter.db "select count(*) from videos") = "$((initial_videos_db
|
||||
test $(sqlite3 twitter.db "select is_gif from videos where tweet_id = 1444429517020274693") = "1"
|
||||
|
||||
# Download the GIF
|
||||
test $(find videos | wc -l) = "$((initial_videos_count))" # Shouldn't have changed yet
|
||||
test $(find videos -mindepth 2 | wc -l) = "$((initial_videos_count))" # Shouldn't have changed yet
|
||||
tw download_tweet_content https://twitter.com/Cernovich/status/1444429517020274693
|
||||
test $(find videos | wc -l) = "$((initial_videos_count + 1))"
|
||||
test $(find videos -mindepth 2 | wc -l) = "$((initial_videos_count + 1))"
|
||||
|
||||
|
||||
# Fetch a tweet with a poll
|
||||
@ -189,12 +189,12 @@ test $urls_count_after_2x = $urls_count_after
|
||||
# Download the link's preview image
|
||||
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1024074310082748416") = "0"
|
||||
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1024074310082748416") = "0"
|
||||
initial_link_preview_images_count=$(find link_preview_images | wc -l)
|
||||
initial_link_preview_images_count=$(find link_preview_images -mindepth 2 | wc -l)
|
||||
tw download_tweet_content 1024074310082748416
|
||||
test $(sqlite3 twitter.db "select is_content_downloaded from tweets where id = 1024074310082748416") = "1"
|
||||
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1024074310082748416") = "1"
|
||||
test $(find link_preview_images | wc -l) = "$((initial_link_preview_images_count + 1))"
|
||||
test -f link_preview_images/${thumbnail_name}_400x400.jpg
|
||||
test $(find link_preview_images -mindepth 2 | wc -l) = "$((initial_link_preview_images_count + 1))"
|
||||
find link_preview_images | grep ${thumbnail_name}_400x400.jpg
|
||||
|
||||
|
||||
# Test a tweet with a URL but no thumbnail
|
||||
|
@ -39,6 +39,16 @@ func (d DefaultDownloader) Curl(url string, outpath string) error {
|
||||
return fmt.Errorf("Error downloading image %s:\n %w", url, err)
|
||||
}
|
||||
|
||||
// Ensure the output directory exists
|
||||
dirname := path.Dir(outpath)
|
||||
if dirname != "." {
|
||||
err = os.Mkdir(dirname, 0755)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
// Write the downloaded data
|
||||
err = os.WriteFile(outpath, data, 0644)
|
||||
if err != nil {
|
||||
return fmt.Errorf("Error writing to path %s, url %s:\n %w", outpath, url, err)
|
||||
|
@ -17,7 +17,8 @@ type Image struct {
|
||||
}
|
||||
|
||||
func ParseAPIMedia(apiMedia APIMedia) Image {
|
||||
local_filename := path.Base(apiMedia.MediaURLHttps)
|
||||
local_filename := get_prefixed_path(path.Base(apiMedia.MediaURLHttps))
|
||||
|
||||
return Image{
|
||||
ID: ImageID(apiMedia.ID),
|
||||
RemoteURL: apiMedia.MediaURLHttps,
|
||||
|
@ -26,6 +26,6 @@ func TestParseAPIMedia(t *testing.T) {
|
||||
assert.Equal("https://pbs.twimg.com/media/E18sEUrWYAk8dBl.jpg", image.RemoteURL)
|
||||
assert.Equal(593, image.Width)
|
||||
assert.Equal(239, image.Height)
|
||||
assert.Equal("E18sEUrWYAk8dBl.jpg", image.LocalFilename)
|
||||
assert.Equal("E1/E18sEUrWYAk8dBl.jpg", image.LocalFilename)
|
||||
assert.False(image.IsDownloaded)
|
||||
}
|
||||
|
@ -60,6 +60,15 @@ func ParseAPIUrlCard(apiCard APICard) Url {
|
||||
return ret
|
||||
}
|
||||
|
||||
// Matches the first two characters of a string when both are word characters
// (ASCII letters, digits, underscore) or dashes.  Compiled once at package load
// instead of on every call to `get_prefixed_path`.
var local_prefix_regex = regexp.MustCompile(`^[\w-]{2}`)

// Place a filename inside a subdirectory named after its first two characters,
// e.g. "E18sEUrWYAk8dBl.jpg" becomes "E1/E18sEUrWYAk8dBl.jpg".
//
// Panics if `p` does not start with two characters that are each a word character
// or a dash (this includes strings shorter than two characters).
func get_prefixed_path(p string) string {
	local_prefix := local_prefix_regex.FindString(p)
	if len(local_prefix) != 2 {
		panic(fmt.Sprintf("Unable to extract a 2-letter prefix for filename %s", p))
	}
	return path.Join(local_prefix, p)
}
|
||||
|
||||
func get_thumbnail_local_path(remote_url string) string {
|
||||
u, err := url.Parse(remote_url)
|
||||
if err != nil {
|
||||
@ -73,7 +82,9 @@ func get_thumbnail_local_path(remote_url string) string {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
return fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0])
|
||||
return get_prefixed_path(
|
||||
fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0]),
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -29,7 +29,7 @@ func TestParseAPIUrlCard(t *testing.T) {
|
||||
assert.Equal(600, url.ThumbnailWidth)
|
||||
assert.Equal(315, url.ThumbnailHeight)
|
||||
assert.Equal("https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x600", url.ThumbnailRemoteUrl)
|
||||
assert.Equal("odDi9EqO_600x600.jpg", url.ThumbnailLocalPath)
|
||||
assert.Equal("od/odDi9EqO_600x600.jpg", url.ThumbnailLocalPath)
|
||||
assert.Equal(UserID(155581583), url.CreatorID)
|
||||
assert.Equal(UserID(16467567), url.SiteID)
|
||||
assert.True(url.HasThumbnail)
|
||||
@ -52,7 +52,7 @@ func TestParseAPIUrlCardWithPlayer(t *testing.T) {
|
||||
assert.Equal("Watch this episode on LBRY/Odysee: https://odysee.com/@capitalresearch:5/the-politically-incorrect-guide-to-the:8"+
|
||||
"Watch this episode on Rumble: https://rumble...", url.Description)
|
||||
assert.Equal("https://pbs.twimg.com/card_img/1437849456423194639/_1t0btyt?format=jpg&name=800x320_1", url.ThumbnailRemoteUrl)
|
||||
assert.Equal("_1t0btyt_800x320_1.jpg", url.ThumbnailLocalPath)
|
||||
assert.Equal("_1/_1t0btyt_800x320_1.jpg", url.ThumbnailLocalPath)
|
||||
assert.Equal(UserID(10228272), url.SiteID)
|
||||
assert.True(url.HasThumbnail)
|
||||
assert.False(url.IsContentDownloaded)
|
||||
|
@ -1,7 +1,7 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/url"
|
||||
"path"
|
||||
"sort"
|
||||
)
|
||||
@ -28,9 +28,18 @@ type Video struct {
|
||||
IsGif bool
|
||||
}
|
||||
|
||||
// Return the last path element of a URL, ignoring any query string.
// Panics if `remote_url` cannot be parsed as a URL.
func get_filename(remote_url string) string {
	parsed, parse_err := url.Parse(remote_url)
	if parse_err == nil {
		return path.Base(parsed.Path)
	}
	panic(parse_err)
}
|
||||
|
||||
func ParseAPIVideo(apiVideo APIExtendedMedia, tweet_id TweetID) Video {
|
||||
variants := apiVideo.VideoInfo.Variants
|
||||
sort.Sort(variants)
|
||||
video_remote_url := variants[0].URL
|
||||
|
||||
var view_count int
|
||||
|
||||
@ -51,18 +60,18 @@ func ParseAPIVideo(apiVideo APIExtendedMedia, tweet_id TweetID) Video {
|
||||
}
|
||||
}
|
||||
|
||||
local_filename := fmt.Sprintf("%d.mp4", tweet_id)
|
||||
local_filename := get_prefixed_path(get_filename(video_remote_url))
|
||||
|
||||
return Video{
|
||||
ID: VideoID(apiVideo.ID),
|
||||
TweetID: tweet_id,
|
||||
Width: apiVideo.OriginalInfo.Width,
|
||||
Height: apiVideo.OriginalInfo.Height,
|
||||
RemoteURL: variants[0].URL,
|
||||
RemoteURL: video_remote_url,
|
||||
LocalFilename: local_filename,
|
||||
|
||||
ThumbnailRemoteUrl: apiVideo.MediaURLHttps,
|
||||
ThumbnailLocalPath: path.Base(apiVideo.MediaURLHttps),
|
||||
ThumbnailLocalPath: get_prefixed_path(path.Base(apiVideo.MediaURLHttps)),
|
||||
Duration: apiVideo.VideoInfo.Duration,
|
||||
ViewCount: view_count,
|
||||
|
||||
|
@ -28,9 +28,9 @@ func TestParseAPIVideo(t *testing.T) {
|
||||
assert.Equal(1280, video.Height)
|
||||
assert.Equal(720, video.Width)
|
||||
assert.Equal("https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/720x1280/sm4iL9_f8Lclh0aa.mp4?tag=12", video.RemoteURL)
|
||||
assert.Equal("28.mp4", video.LocalFilename)
|
||||
assert.Equal("sm/sm4iL9_f8Lclh0aa.mp4", video.LocalFilename)
|
||||
assert.Equal("https://pbs.twimg.com/ext_tw_video_thumb/1418951950020845568/pu/img/eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailRemoteUrl)
|
||||
assert.Equal("eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailLocalPath)
|
||||
assert.Equal("eU/eUTaYYfuAJ8FyjUi.jpg", video.ThumbnailLocalPath)
|
||||
assert.Equal(275952, video.ViewCount)
|
||||
assert.Equal(88300, video.Duration)
|
||||
assert.False(video.IsDownloaded)
|
||||
|
Loading…
x
Reference in New Issue
Block a user