diff --git a/persistence/schema.sql b/persistence/schema.sql index 575a146..545293e 100644 --- a/persistence/schema.sql +++ b/persistence/schema.sql @@ -49,6 +49,8 @@ create table tweets (rowid integer primary key, is_stub boolean default 0, is_content_downloaded boolean default 0, + is_conversation_scraped boolean default 0, + last_scraped_at integer not null default 0, foreign key(user_id) references users(id) ); diff --git a/persistence/tweet_queries.go b/persistence/tweet_queries.go index 69e6676..e37401d 100644 --- a/persistence/tweet_queries.go +++ b/persistence/tweet_queries.go @@ -16,18 +16,20 @@ func (p Profile) SaveTweet(t scraper.Tweet) error { return err } _, err = db.Exec(` - insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, quoted_tweet_id, mentions, reply_mentions, hashtags, tombstone_type, is_stub, is_content_downloaded) - values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, (select rowid from tombstone_types where short_name=?), ?, ?) + insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, quoted_tweet_id, mentions, reply_mentions, hashtags, tombstone_type, is_stub, is_content_downloaded, is_conversation_scraped, last_scraped_at) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, (select rowid from tombstone_types where short_name=?), ?, ?, ?, ?) on conflict do update set num_likes=?, num_retweets=?, num_replies=?, num_quote_tweets=?, is_stub=(is_stub and ?), - is_content_downloaded=(is_content_downloaded or ?) + is_content_downloaded=(is_content_downloaded or ?), + is_conversation_scraped=(is_conversation_scraped or ?), + last_scraped_at=max(last_scraped_at, ?) 
`, - t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyToID, t.QuotedTweetID, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions), strings.Join(t.Hashtags, ","), t.TombstoneType, t.IsStub, t.IsContentDownloaded, - t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsStub, t.IsContentDownloaded, + t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyToID, t.QuotedTweetID, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions), strings.Join(t.Hashtags, ","), t.TombstoneType, t.IsStub, t.IsContentDownloaded, t.IsConversationScraped, t.LastScrapedAt.Unix(), + t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsStub, t.IsContentDownloaded, t.IsConversationScraped, t.LastScrapedAt.Unix(), ) if err != nil { @@ -90,7 +92,7 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) { db := p.DB stmt, err := db.Prepare(` - select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, quoted_tweet_id, mentions, reply_mentions, hashtags, ifnull(tombstone_types.short_name, ""), is_stub, is_content_downloaded + select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, quoted_tweet_id, mentions, reply_mentions, hashtags, ifnull(tombstone_types.short_name, ""), is_stub, is_content_downloaded, is_conversation_scraped, last_scraped_at from tweets left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid where id = ? 
`) @@ -102,17 +104,20 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) { var t scraper.Tweet var postedAt int + var last_scraped_at int var mentions string var reply_mentions string var hashtags string row := stmt.QueryRow(id) - err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyToID, &t.QuotedTweetID, &mentions, &reply_mentions, &hashtags, &t.TombstoneType, &t.IsStub, &t.IsContentDownloaded) + err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyToID, &t.QuotedTweetID, &mentions, &reply_mentions, &hashtags, &t.TombstoneType, &t.IsStub, &t.IsContentDownloaded, &t.IsConversationScraped, &last_scraped_at) if err != nil { return t, err } t.PostedAt = time.Unix(int64(postedAt), 0) // args are `seconds` and `nanoseconds` + t.LastScrapedAt = time.Unix(int64(last_scraped_at), 0) + t.Mentions = []scraper.UserHandle{} for _, m := range strings.Split(mentions, ",") { if m != "" { diff --git a/persistence/tweet_queries_test.go b/persistence/tweet_queries_test.go index 81c4db6..8c7e13c 100644 --- a/persistence/tweet_queries_test.go +++ b/persistence/tweet_queries_test.go @@ -2,6 +2,7 @@ package persistence_test import ( "testing" + "time" "github.com/go-test/deep" ) @@ -75,6 +76,8 @@ func TestNoWorseningTweet(t *testing.T) { tweet := create_dummy_tweet() tweet.IsContentDownloaded = true tweet.IsStub = false + tweet.IsConversationScraped = true + tweet.LastScrapedAt = time.Unix(1000, 0) // Save the tweet err := profile.SaveTweet(tweet) @@ -85,6 +88,8 @@ func TestNoWorseningTweet(t *testing.T) { // Worsen the tweet and re-save it tweet.IsContentDownloaded = false tweet.IsStub = true + tweet.IsConversationScraped = false + tweet.LastScrapedAt = time.Unix(500, 0) err = profile.SaveTweet(tweet) if err != nil { t.Fatalf("Failed to save the tweet: %s", err.Error()) @@ -102,6 +107,77 @@ func 
TestNoWorseningTweet(t *testing.T) { if new_tweet.IsContentDownloaded != true { t.Errorf("Should have preserved is-content-downloaded status") } + if new_tweet.IsConversationScraped == false { + t.Errorf("Should have preserved is-conversation-scraped status") + } + if new_tweet.LastScrapedAt.Unix() != 1000 { + t.Errorf("Should have preserved last-scraped-at time") + } +} + +func TestModifyTweet(t *testing.T) { + profile_path := "test_profiles/TestTweetQueries" + profile := create_or_load_profile(profile_path) + + tweet := create_dummy_tweet() + tweet.NumLikes = 1000 + tweet.NumRetweets = 2000 + tweet.NumReplies = 3000 + tweet.NumQuoteTweets = 4000 + tweet.IsStub = true + tweet.IsContentDownloaded = false + tweet.IsConversationScraped = false + tweet.LastScrapedAt = time.Unix(1000, 0) + + err := profile.SaveTweet(tweet) + if err != nil { + t.Fatalf("Failed to save the tweet: %s", err.Error()) + } + + tweet.NumLikes = 1500 + tweet.NumRetweets = 2500 + tweet.NumReplies = 3500 + tweet.NumQuoteTweets = 4500 + tweet.IsStub = false + tweet.IsContentDownloaded = true + tweet.IsConversationScraped = true + tweet.LastScrapedAt = time.Unix(2000, 0) + + err = profile.SaveTweet(tweet) + if err != nil { + t.Fatalf("Failed to re-save the tweet: %s", err.Error()) + } + + // Reload the tweet + new_tweet, err := profile.GetTweetById(tweet.ID) + if err != nil { + t.Fatalf("Failed to load the tweet: %s", err.Error()) + } + + if new_tweet.NumLikes != 1500 { + t.Errorf("Expected %d likes, got %d", 1500, new_tweet.NumLikes) + } + if new_tweet.NumRetweets != 2500 { + t.Errorf("Expected %d retweets, got %d", 2500, new_tweet.NumRetweets) + } + if new_tweet.NumReplies != 3500 { + t.Errorf("Expected %d replies, got %d", 3500, new_tweet.NumReplies) + } + if new_tweet.NumQuoteTweets != 4500 { + t.Errorf("Expected %d quote tweets, got %d", 4500, new_tweet.NumQuoteTweets) + } + if new_tweet.IsStub != false { + t.Errorf("Expected tweet to not be a stub, but it was") + } + if 
new_tweet.IsContentDownloaded != true { + t.Errorf("Expected tweet content to be downloaded, but it wasn't") + } + if new_tweet.IsConversationScraped != true { + t.Errorf("Expected conversation to be scraped, but it wasn't") + } + if new_tweet.LastScrapedAt.Unix() != 2000 { + t.Errorf("Expected tweet to be scraped at %d (unix timestamp), but got %d", 2000, new_tweet.LastScrapedAt.Unix()) + } } /** diff --git a/persistence/utils_test.go b/persistence/utils_test.go index 66ef18e..48bb5de 100644 --- a/persistence/utils_test.go +++ b/persistence/utils_test.go @@ -168,6 +168,8 @@ func create_stable_tweet() scraper.Tweet { Polls: []scraper.Poll{ create_poll_from_id(-1), }, + IsConversationScraped: true, + LastScrapedAt: time.Unix(100000000, 0), } } diff --git a/persistence/versions.go b/persistence/versions.go index bbfcd8f..40b21f5 100644 --- a/persistence/versions.go +++ b/persistence/versions.go @@ -8,7 +8,7 @@ import ( ) -const ENGINE_DATABASE_VERSION = 1 +const ENGINE_DATABASE_VERSION = 2 type VersionMismatchError struct { @@ -48,7 +48,9 @@ var MIGRATIONS = []string{ last_scraped_at integer not null, foreign key(tweet_id) references tweets(id) -);`, + );`, +`alter table tweets add column is_conversation_scraped boolean default 0; + alter table tweets add column last_scraped_at integer not null default 0`, } /** diff --git a/scraper/tweet.go b/scraper/tweet.go index bac5dca..74f21f2 100644 --- a/scraper/tweet.go +++ b/scraper/tweet.go @@ -37,6 +37,8 @@ type Tweet struct { IsStub bool IsContentDownloaded bool + IsConversationScraped bool + LastScrapedAt time.Time }