From 0c87428c44cf7473436bd308caa4320b960127c3 Mon Sep 17 00:00:00 2001 From: Alessio Date: Sun, 25 Jul 2021 15:42:43 -0700 Subject: [PATCH] Make videos their own table; store as a slice on Tweet rather than text field --- persistence/schema.sql | 18 ++++++++++--- persistence/tweet_queries.go | 45 +++++++++++++++++++++++++++---- persistence/tweet_queries_test.go | 4 +-- persistence/utils_test.go | 2 +- scraper/tweet.go | 4 +-- scraper/tweet_test.go | 4 +-- 6 files changed, 62 insertions(+), 15 deletions(-) diff --git a/persistence/schema.sql b/persistence/schema.sql index f4355ad..28a594d 100644 --- a/persistence/schema.sql +++ b/persistence/schema.sql @@ -14,26 +14,28 @@ create table users (rowid integer primary key, is_verified boolean default 0, profile_image_url text, banner_image_url text, - pinned_tweet_id integer check(typeof(pinned_tweet_id) = 'integer' or pinned_tweet_id = '') + pinned_tweet_id integer check(typeof(pinned_tweet_id) = 'integer' or pinned_tweet_id = ''), + + is_content_downloaded boolean default 0 -- foreign key(pinned_tweet_id) references tweets(id) ); create table tweets (rowid integer primary key, id integer unique not null check(typeof(id) = 'integer'), - user_id integer not null check(typeof(id) = 'integer'), + user_id integer not null check(typeof(user_id) = 'integer'), text text not null, posted_at integer, num_likes integer, num_retweets integer, num_replies integer, num_quote_tweets integer, - video_url text, in_reply_to integer, quoted_tweet integer, mentions text, -- comma-separated hashtags text, -- comma-separated + is_content_downloaded boolean default 0, foreign key(user_id) references users(id) -- foreign key(in_reply_to) references tweets(id), -- foreign key(quoted_tweet) references tweets(id) @@ -59,6 +61,16 @@ create table urls (rowid integer primary key, create table images (rowid integer primary key, tweet_id integer not null, filename text not null, + is_downloaded, + + unique (tweet_id, filename) + foreign key(tweet_id) references tweets(id) +); + +create table videos (rowid integer primary key, + tweet_id integer not null, + filename text not null, + is_downloaded, unique (tweet_id, filename) foreign key(tweet_id) references tweets(id) diff --git a/persistence/tweet_queries.go b/persistence/tweet_queries.go index 7049b02..a961623 100644 --- a/persistence/tweet_queries.go +++ b/persistence/tweet_queries.go @@ -17,15 +17,15 @@ func (p Profile) SaveTweet(t scraper.Tweet) error { return err } _, err = db.Exec(` - insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, video_url, in_reply_to, quoted_tweet, mentions, hashtags) - values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, hashtags) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) on conflict do update set num_likes=?, num_retweets=?, num_replies=?, num_quote_tweets=? `, - t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.Video, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), strings.Join(t.Hashtags, ","), + t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), strings.Join(t.Hashtags, ","), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, ) @@ -44,12 +44,19 @@ func (p Profile) SaveTweet(t scraper.Tweet) error { return err } } + for _, video := range t.Videos { + _, err := db.Exec("insert into videos (tweet_id, filename) values (?, ?) on conflict do nothing", t.ID, video) + if err != nil { + return err + } + } for _, hashtag := range t.Hashtags { _, err := db.Exec("insert into hashtags (tweet_id, text) values (?, ?) on conflict do nothing", t.ID, hashtag) if err != nil { return err } } + err = tx.Commit() if err != nil { return err @@ -96,6 +103,30 @@ func (p Profile) attach_images(t *scraper.Tweet) error { return nil } +func (p Profile) attach_videos(t *scraper.Tweet) error { + println("Attaching videos") + stmt, err := p.DB.Prepare("select filename from videos where tweet_id = ?") + if err != nil { + return err + } + defer stmt.Close() + rows, err := stmt.Query(t.ID) + if err != nil { + return err + } + var video string + for rows.Next() { + err = rows.Scan(&video) + if err != nil { + return err + } + println(video) + t.Videos = append(t.Videos, video) + fmt.Printf("%v\n", t.Videos) + } + return nil +} + func (p Profile) attach_urls(t *scraper.Tweet) error { println("Attaching urls") stmt, err := p.DB.Prepare("select text from urls where tweet_id = ?") @@ -124,7 +155,7 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) { db := p.DB stmt, err := db.Prepare(` - select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, video_url, in_reply_to, quoted_tweet, mentions, hashtags + select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, hashtags from tweets where id = ? `) @@ -142,7 +173,7 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) { var user_id int64 row := stmt.QueryRow(id) - err = row.Scan(&tweet_id, &user_id, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.Video, &t.InReplyTo, &t.QuotedTweet, &mentions, &hashtags) + err = row.Scan(&tweet_id, &user_id, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyTo, &t.QuotedTweet, &mentions, &hashtags) if err != nil { return t, err } @@ -159,6 +190,10 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) { if err != nil { return t, err } + err = p.attach_videos(&t) + if err != nil { + return t, err + } err = p.attach_urls(&t) return t, err } diff --git a/persistence/tweet_queries_test.go b/persistence/tweet_queries_test.go index 5034f36..7649e6f 100644 --- a/persistence/tweet_queries_test.go +++ b/persistence/tweet_queries_test.go @@ -28,13 +28,13 @@ func TestSaveAndLoadTweet(t *testing.T) { // Save the tweet err = profile.SaveTweet(tweet) if err != nil { - t.Errorf("Failed to save the tweet: %s", err.Error()) + t.Fatalf("Failed to save the tweet: %s", err.Error()) } // Reload the tweet new_tweet, err := profile.GetTweetById(tweet.ID) if err != nil { - t.Errorf("Failed to load the tweet: %s", err.Error()) + t.Fatalf("Failed to load the tweet: %s", err.Error()) } if diff := deep.Equal(tweet, new_tweet); diff != nil { diff --git a/persistence/utils_test.go b/persistence/utils_test.go index 1585f4f..2163614 100644 --- a/persistence/utils_test.go +++ b/persistence/utils_test.go @@ -69,7 +69,7 @@ func create_dummy_tweet() scraper.Tweet { NumRetweets: 2, NumReplies: 3, NumQuoteTweets: 4, - Video: "video", + Videos: []string{"video"}, Urls: []string{"url1", "url2"}, Images: []string{"image1", "image2"}, Mentions: []scraper.UserHandle{"mention1", "mention2"}, diff --git a/scraper/tweet.go b/scraper/tweet.go index 5665232..b0a1d1d 100644 --- a/scraper/tweet.go +++ b/scraper/tweet.go @@ -22,11 +22,11 @@ type Tweet struct { NumRetweets int NumReplies int NumQuoteTweets int - Video string InReplyTo TweetID Urls []string Images []string + Videos []string Mentions []UserHandle Hashtags []string QuotedTweet TweetID @@ -116,7 +116,7 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) { } variants := apiTweet.ExtendedEntities.Media[0].VideoInfo.Variants sort.Sort(variants) - ret.Video = variants[0].URL + ret.Videos = []string{variants[0].URL} ret.Images = []string{} } return diff --git a/scraper/tweet_test.go b/scraper/tweet_test.go index f77b66b..63e64ea 100644 --- a/scraper/tweet_test.go +++ b/scraper/tweet_test.go @@ -153,8 +153,8 @@ func TestParseTweetWithVideo(t *testing.T) { t.Errorf(err.Error()) } expected_video := "https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/720x1280/sm4iL9_f8Lclh0aa.mp4?tag=12" - if tweet.Video != expected_video { - t.Errorf("Expected video %q, but got %q", expected_video, tweet.Video) + if len(tweet.Videos) != 1 || tweet.Videos[0] != expected_video { + t.Errorf("Expected video %q, but got %+v", expected_video, tweet.Videos) } if len(tweet.Images) != 0 {