Make videos their own table; store as a slice on Tweet rather than text field

This commit is contained in:
Alessio 2021-07-25 15:42:43 -07:00
parent 647dd8aa6b
commit 0c87428c44
6 changed files with 62 additions and 15 deletions

View File

@ -14,26 +14,28 @@ create table users (rowid integer primary key,
is_verified boolean default 0, is_verified boolean default 0,
profile_image_url text, profile_image_url text,
banner_image_url text, banner_image_url text,
pinned_tweet_id integer check(typeof(pinned_tweet_id) = 'integer' or pinned_tweet_id = '') pinned_tweet_id integer check(typeof(pinned_tweet_id) = 'integer' or pinned_tweet_id = ''),
is_content_downloaded boolean default 0
-- foreign key(pinned_tweet_id) references tweets(id) -- foreign key(pinned_tweet_id) references tweets(id)
); );
create table tweets (rowid integer primary key, create table tweets (rowid integer primary key,
id integer unique not null check(typeof(id) = 'integer'), id integer unique not null check(typeof(id) = 'integer'),
user_id integer not null check(typeof(id) = 'integer'), user_id integer not null check(typeof(user_id) = 'integer'),
text text not null, text text not null,
posted_at integer, posted_at integer,
num_likes integer, num_likes integer,
num_retweets integer, num_retweets integer,
num_replies integer, num_replies integer,
num_quote_tweets integer, num_quote_tweets integer,
video_url text,
in_reply_to integer, in_reply_to integer,
quoted_tweet integer, quoted_tweet integer,
mentions text, -- comma-separated mentions text, -- comma-separated
hashtags text, -- comma-separated hashtags text, -- comma-separated
is_content_downloaded boolean default 0,
foreign key(user_id) references users(id) foreign key(user_id) references users(id)
-- foreign key(in_reply_to) references tweets(id), -- foreign key(in_reply_to) references tweets(id),
-- foreign key(quoted_tweet) references tweets(id) -- foreign key(quoted_tweet) references tweets(id)
@ -59,6 +61,16 @@ create table urls (rowid integer primary key,
create table images (rowid integer primary key, create table images (rowid integer primary key,
tweet_id integer not null, tweet_id integer not null,
filename text not null, filename text not null,
is_downloaded,
unique (tweet_id, filename)
foreign key(tweet_id) references tweets(id)
);
create table videos (rowid integer primary key,
tweet_id integer not null,
filename text not null,
is_downloaded,
unique (tweet_id, filename) unique (tweet_id, filename)
foreign key(tweet_id) references tweets(id) foreign key(tweet_id) references tweets(id)

View File

@ -17,15 +17,15 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
return err return err
} }
_, err = db.Exec(` _, err = db.Exec(`
insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, video_url, in_reply_to, quoted_tweet, mentions, hashtags) insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, hashtags)
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
on conflict do update on conflict do update
set num_likes=?, set num_likes=?,
num_retweets=?, num_retweets=?,
num_replies=?, num_replies=?,
num_quote_tweets=? num_quote_tweets=?
`, `,
t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.Video, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), strings.Join(t.Hashtags, ","), t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), strings.Join(t.Hashtags, ","),
t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets,
) )
@ -44,12 +44,19 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
return err return err
} }
} }
for _, video := range t.Videos {
_, err := db.Exec("insert into videos (tweet_id, filename) values (?, ?) on conflict do nothing", t.ID, video)
if err != nil {
return err
}
}
for _, hashtag := range t.Hashtags { for _, hashtag := range t.Hashtags {
_, err := db.Exec("insert into hashtags (tweet_id, text) values (?, ?) on conflict do nothing", t.ID, hashtag) _, err := db.Exec("insert into hashtags (tweet_id, text) values (?, ?) on conflict do nothing", t.ID, hashtag)
if err != nil { if err != nil {
return err return err
} }
} }
err = tx.Commit() err = tx.Commit()
if err != nil { if err != nil {
return err return err
@ -96,6 +103,30 @@ func (p Profile) attach_images(t *scraper.Tweet) error {
return nil return nil
} }
// attach_videos looks up every row in the `videos` table whose tweet_id
// equals t.ID and appends each row's filename to t.Videos.
//
// It returns the first error encountered while preparing the statement,
// running the query, scanning a row, or iterating the result set. On error,
// t.Videos may already contain the filenames read before the failure.
func (p Profile) attach_videos(t *scraper.Tweet) error {
	// Trace line kept to mirror the sibling attach_* helpers (e.g. attach_urls).
	println("Attaching videos")

	stmt, err := p.DB.Prepare("select filename from videos where tweet_id = ?")
	if err != nil {
		return err
	}
	defer stmt.Close()

	rows, err := stmt.Query(t.ID)
	if err != nil {
		return err
	}
	// Release the result set on every exit path, including the early return
	// taken when Scan fails part-way through the loop.
	defer rows.Close()

	for rows.Next() {
		var video string
		if err := rows.Scan(&video); err != nil {
			return err
		}
		t.Videos = append(t.Videos, video)
	}

	// Next() returning false does not distinguish "no more rows" from
	// "iteration failed"; Err() reports the latter so a truncated result is
	// not silently treated as success.
	return rows.Err()
}
func (p Profile) attach_urls(t *scraper.Tweet) error { func (p Profile) attach_urls(t *scraper.Tweet) error {
println("Attaching urls") println("Attaching urls")
stmt, err := p.DB.Prepare("select text from urls where tweet_id = ?") stmt, err := p.DB.Prepare("select text from urls where tweet_id = ?")
@ -124,7 +155,7 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
db := p.DB db := p.DB
stmt, err := db.Prepare(` stmt, err := db.Prepare(`
select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, video_url, in_reply_to, quoted_tweet, mentions, hashtags select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, hashtags
from tweets from tweets
where id = ? where id = ?
`) `)
@ -142,7 +173,7 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
var user_id int64 var user_id int64
row := stmt.QueryRow(id) row := stmt.QueryRow(id)
err = row.Scan(&tweet_id, &user_id, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.Video, &t.InReplyTo, &t.QuotedTweet, &mentions, &hashtags) err = row.Scan(&tweet_id, &user_id, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyTo, &t.QuotedTweet, &mentions, &hashtags)
if err != nil { if err != nil {
return t, err return t, err
} }
@ -159,6 +190,10 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
if err != nil { if err != nil {
return t, err return t, err
} }
err = p.attach_videos(&t)
if err != nil {
return t, err
}
err = p.attach_urls(&t) err = p.attach_urls(&t)
return t, err return t, err
} }

View File

@ -28,13 +28,13 @@ func TestSaveAndLoadTweet(t *testing.T) {
// Save the tweet // Save the tweet
err = profile.SaveTweet(tweet) err = profile.SaveTweet(tweet)
if err != nil { if err != nil {
t.Errorf("Failed to save the tweet: %s", err.Error()) t.Fatalf("Failed to save the tweet: %s", err.Error())
} }
// Reload the tweet // Reload the tweet
new_tweet, err := profile.GetTweetById(tweet.ID) new_tweet, err := profile.GetTweetById(tweet.ID)
if err != nil { if err != nil {
t.Errorf("Failed to load the tweet: %s", err.Error()) t.Fatalf("Failed to load the tweet: %s", err.Error())
} }
if diff := deep.Equal(tweet, new_tweet); diff != nil { if diff := deep.Equal(tweet, new_tweet); diff != nil {

View File

@ -69,7 +69,7 @@ func create_dummy_tweet() scraper.Tweet {
NumRetweets: 2, NumRetweets: 2,
NumReplies: 3, NumReplies: 3,
NumQuoteTweets: 4, NumQuoteTweets: 4,
Video: "video", Videos: []string{"video"},
Urls: []string{"url1", "url2"}, Urls: []string{"url1", "url2"},
Images: []string{"image1", "image2"}, Images: []string{"image1", "image2"},
Mentions: []scraper.UserHandle{"mention1", "mention2"}, Mentions: []scraper.UserHandle{"mention1", "mention2"},

View File

@ -22,11 +22,11 @@ type Tweet struct {
NumRetweets int NumRetweets int
NumReplies int NumReplies int
NumQuoteTweets int NumQuoteTweets int
Video string
InReplyTo TweetID InReplyTo TweetID
Urls []string Urls []string
Images []string Images []string
Videos []string
Mentions []UserHandle Mentions []UserHandle
Hashtags []string Hashtags []string
QuotedTweet TweetID QuotedTweet TweetID
@ -116,7 +116,7 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
} }
variants := apiTweet.ExtendedEntities.Media[0].VideoInfo.Variants variants := apiTweet.ExtendedEntities.Media[0].VideoInfo.Variants
sort.Sort(variants) sort.Sort(variants)
ret.Video = variants[0].URL ret.Videos = []string{variants[0].URL}
ret.Images = []string{} ret.Images = []string{}
} }
return return

View File

@ -153,8 +153,8 @@ func TestParseTweetWithVideo(t *testing.T) {
t.Errorf(err.Error()) t.Errorf(err.Error())
} }
expected_video := "https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/720x1280/sm4iL9_f8Lclh0aa.mp4?tag=12" expected_video := "https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/720x1280/sm4iL9_f8Lclh0aa.mp4?tag=12"
if tweet.Video != expected_video { if len(tweet.Videos) != 1 || tweet.Videos[0] != expected_video {
t.Errorf("Expected video %q, but got %q", expected_video, tweet.Video) t.Errorf("Expected video %q, but got %+v", expected_video, tweet.Videos)
} }
if len(tweet.Images) != 0 { if len(tweet.Images) != 0 {