Add Tweet#last_scraped_at and Tweet#is_conversation_scraped fields

Alessio 2021-12-20 14:07:20 -05:00
parent c95ed6c918
commit c1bcd54a11
6 changed files with 98 additions and 9 deletions

View File

@@ -49,6 +49,8 @@ create table tweets (rowid integer primary key,
 is_stub boolean default 0,
 is_content_downloaded boolean default 0,
+is_conversation_scraped boolean default 0,
+last_scraped_at integer not null default 0,
 foreign key(user_id) references users(id)
 );

View File

@@ -16,18 +16,20 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
 return err
 }
 _, err = db.Exec(`
-insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, quoted_tweet_id, mentions, reply_mentions, hashtags, tombstone_type, is_stub, is_content_downloaded)
-values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, (select rowid from tombstone_types where short_name=?), ?, ?)
+insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, quoted_tweet_id, mentions, reply_mentions, hashtags, tombstone_type, is_stub, is_content_downloaded, is_conversation_scraped, last_scraped_at)
+values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, (select rowid from tombstone_types where short_name=?), ?, ?, ?, ?)
 on conflict do update
 set num_likes=?,
 num_retweets=?,
 num_replies=?,
 num_quote_tweets=?,
 is_stub=(is_stub and ?),
-is_content_downloaded=(is_content_downloaded or ?)
+is_content_downloaded=(is_content_downloaded or ?),
+is_conversation_scraped=(is_conversation_scraped or ?),
+last_scraped_at=max(last_scraped_at, ?)
 `,
-t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyToID, t.QuotedTweetID, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions), strings.Join(t.Hashtags, ","), t.TombstoneType, t.IsStub, t.IsContentDownloaded,
-t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsStub, t.IsContentDownloaded,
+t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyToID, t.QuotedTweetID, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions), strings.Join(t.Hashtags, ","), t.TombstoneType, t.IsStub, t.IsContentDownloaded, t.IsConversationScraped, t.LastScrapedAt.Unix(),
+t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsStub, t.IsContentDownloaded, t.IsConversationScraped, t.LastScrapedAt.Unix(),
 )
 if err != nil {
@@ -90,7 +92,7 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
 db := p.DB
 stmt, err := db.Prepare(`
-select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, quoted_tweet_id, mentions, reply_mentions, hashtags, ifnull(tombstone_types.short_name, ""), is_stub, is_content_downloaded
+select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, quoted_tweet_id, mentions, reply_mentions, hashtags, ifnull(tombstone_types.short_name, ""), is_stub, is_content_downloaded, is_conversation_scraped, last_scraped_at
 from tweets left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
 where id = ?
 `)
@@ -102,17 +104,20 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
 var t scraper.Tweet
 var postedAt int
+var last_scraped_at int
 var mentions string
 var reply_mentions string
 var hashtags string
 row := stmt.QueryRow(id)
-err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyToID, &t.QuotedTweetID, &mentions, &reply_mentions, &hashtags, &t.TombstoneType, &t.IsStub, &t.IsContentDownloaded)
+err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyToID, &t.QuotedTweetID, &mentions, &reply_mentions, &hashtags, &t.TombstoneType, &t.IsStub, &t.IsContentDownloaded, &t.IsConversationScraped, &last_scraped_at)
 if err != nil {
 return t, err
 }
 t.PostedAt = time.Unix(int64(postedAt), 0) // args are `seconds` and `nanoseconds`
+t.LastScrapedAt = time.Unix(int64(last_scraped_at), 0)
 t.Mentions = []scraper.UserHandle{}
 for _, m := range strings.Split(mentions, ",") {
 if m != "" {
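The on-conflict clause in SaveTweet above is deliberately monotonic: re-saving a tweet can only flip is_conversation_scraped from 0 to 1 (boolean OR) and can only move last_scraped_at forward (max). A minimal, self-contained Go sketch of that merge rule; the mergeScrapeState helper is hypothetical and not part of this commit:

package main

import (
	"fmt"
	"time"
)

// mergeScrapeState mirrors the upsert semantics above: is_conversation_scraped
// only ever flips from false to true, and last_scraped_at only ever moves forward.
func mergeScrapeState(oldScraped bool, oldAt time.Time, newScraped bool, newAt time.Time) (bool, time.Time) {
	scraped := oldScraped || newScraped
	at := oldAt
	if newAt.After(at) {
		at = newAt
	}
	return scraped, at
}

func main() {
	// Re-saving a stale, unscraped copy does not undo the earlier scrape.
	scraped, at := mergeScrapeState(true, time.Unix(1000, 0), false, time.Unix(500, 0))
	fmt.Println(scraped, at.Unix()) // prints: true 1000
}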

View File

@@ -2,6 +2,7 @@ package persistence_test
 import (
 "testing"
+"time"
 "github.com/go-test/deep"
 )
@@ -75,6 +76,8 @@ func TestNoWorseningTweet(t *testing.T) {
 tweet := create_dummy_tweet()
 tweet.IsContentDownloaded = true
 tweet.IsStub = false
+tweet.IsConversationScraped = true
+tweet.LastScrapedAt = time.Unix(1000, 0)
 // Save the tweet
 err := profile.SaveTweet(tweet)
@@ -85,6 +88,8 @@ func TestNoWorseningTweet(t *testing.T) {
 // Worsen the tweet and re-save it
 tweet.IsContentDownloaded = false
 tweet.IsStub = true
+tweet.IsConversationScraped = false
+tweet.LastScrapedAt = time.Unix(500, 0)
 err = profile.SaveTweet(tweet)
 if err != nil {
 t.Fatalf("Failed to save the tweet: %s", err.Error())
@@ -102,6 +107,77 @@ func TestNoWorseningTweet(t *testing.T) {
 if new_tweet.IsContentDownloaded != true {
 t.Errorf("Should have preserved is-content-downloaded status")
 }
+if new_tweet.IsConversationScraped == false {
+t.Errorf("Should have preserved is-conversation-scraped status")
+}
+if new_tweet.LastScrapedAt.Unix() != 1000 {
+t.Errorf("Should have preserved last-scraped-at time")
+}
+}
+func TestModifyTweet(t *testing.T) {
+profile_path := "test_profiles/TestTweetQueries"
+profile := create_or_load_profile(profile_path)
+tweet := create_dummy_tweet()
+tweet.NumLikes = 1000
+tweet.NumRetweets = 2000
+tweet.NumReplies = 3000
+tweet.NumQuoteTweets = 4000
+tweet.IsStub = true
+tweet.IsContentDownloaded = false
+tweet.IsConversationScraped = false
+tweet.LastScrapedAt = time.Unix(1000, 0)
+err := profile.SaveTweet(tweet)
+if err != nil {
+t.Fatalf("Failed to save the tweet: %s", err.Error())
+}
+tweet.NumLikes = 1500
+tweet.NumRetweets = 2500
+tweet.NumReplies = 3500
+tweet.NumQuoteTweets = 4500
+tweet.IsStub = false
+tweet.IsContentDownloaded = true
+tweet.IsConversationScraped = true
+tweet.LastScrapedAt = time.Unix(2000, 0)
+err = profile.SaveTweet(tweet)
+if err != nil {
+t.Fatalf("Failed to re-save the tweet: %s", err.Error())
+}
+// Reload the tweet
+new_tweet, err := profile.GetTweetById(tweet.ID)
+if err != nil {
+t.Fatalf("Failed to load the tweet: %s", err.Error())
+}
+if new_tweet.NumLikes != 1500 {
+t.Errorf("Expected %d likes, got %d", 1500, new_tweet.NumLikes)
+}
+if new_tweet.NumRetweets != 2500 {
+t.Errorf("Expected %d retweets, got %d", 2500, new_tweet.NumRetweets)
+}
+if new_tweet.NumReplies != 3500 {
t.Errorf("Expected %d replies, got %d", 1500, new_tweet.NumReplies)
+}
+if new_tweet.NumQuoteTweets != 4500 {
+t.Errorf("Expected %d quote tweets, got %d", 4500, new_tweet.NumQuoteTweets)
+}
+if new_tweet.IsStub != false {
+t.Errorf("Expected tweet to not be a stub, but it was")
+}
+if new_tweet.IsContentDownloaded != true {
+t.Errorf("Expected tweet content to be downloaded, but it wasn't")
+}
+if new_tweet.IsConversationScraped != true {
+t.Errorf("Expected conversation to be scraped, but it wasn't")
+}
+if new_tweet.LastScrapedAt.Unix() != 2000 {
+t.Errorf("Expected tweet to be scraped at %d (unix timestamp), but got %d", 2000, new_tweet.LastScrapedAt.Unix())
+}
 }
 /**

View File

@@ -168,6 +168,8 @@ func create_stable_tweet() scraper.Tweet {
 Polls: []scraper.Poll{
 create_poll_from_id(-1),
 },
+IsConversationScraped: true,
+LastScrapedAt: time.Unix(100000000, 0),
 }
 }

View File

@@ -8,7 +8,7 @@ import (
 )
-const ENGINE_DATABASE_VERSION = 1
+const ENGINE_DATABASE_VERSION = 2
 type VersionMismatchError struct {
@@ -48,7 +48,9 @@ var MIGRATIONS = []string{
 last_scraped_at integer not null,
 foreign key(tweet_id) references tweets(id)
 );`,
+`alter table tweets add column is_conversation_scraped boolean default 0;
+alter table tweets add column last_scraped_at integer not null default 0`,
 }
 /**
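ENGINE_DATABASE_VERSION moves from 1 to 2 and MIGRATIONS gains an entry that retrofits the two new columns onto existing databases. The upgrade routine that walks this list is not part of the diff; the stand-alone sketch below only illustrates how such an entry could be applied. The applyMigrations helper, the slice-index versioning scheme, and the mattn/go-sqlite3 driver are assumptions, not the project's actual code:

package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/mattn/go-sqlite3" // assumed driver; the project's driver is not shown in this diff
)

// applyMigrations is a hypothetical runner: it executes every pending entry of
// the migrations slice between two schema versions, in order.
func applyMigrations(db *sql.DB, migrations []string, from, to int) error {
	for i := from; i < to; i++ {
		if _, err := db.Exec(migrations[i]); err != nil {
			return fmt.Errorf("migration %d failed: %w", i, err)
		}
	}
	return nil
}

func main() {
	db, err := sql.Open("sqlite3", ":memory:")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Stand-in for a version-1 tweets table; only what the demo needs.
	if _, err := db.Exec(`create table tweets (rowid integer primary key, text text)`); err != nil {
		log.Fatal(err)
	}

	// Only the entry added in this commit, taking the schema from version 1 to 2.
	migrations := []string{
		`alter table tweets add column is_conversation_scraped boolean default 0;
		alter table tweets add column last_scraped_at integer not null default 0`,
	}
	if err := applyMigrations(db, migrations, 0, len(migrations)); err != nil {
		log.Fatal(err)
	}
	fmt.Println("tweets table upgraded to the version-2 schema")
}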

View File

@@ -37,6 +37,8 @@ type Tweet struct {
 IsStub bool
 IsContentDownloaded bool
+IsConversationScraped bool
+LastScrapedAt time.Time
 }
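With the two struct fields in place, calling code can decide when a tweet's conversation is due for (re-)scraping. The sketch below is illustrative only: needsConversationScrape and the tweetScrapeState stand-in type are hypothetical and not part of this commit.

package main

import (
	"fmt"
	"time"
)

// Local stand-in for the two Tweet fields added in this commit.
type tweetScrapeState struct {
	IsConversationScraped bool
	LastScrapedAt         time.Time
}

// needsConversationScrape is an illustrative policy: scrape if the conversation
// was never scraped, or if the last scrape is older than maxAge.
func needsConversationScrape(t tweetScrapeState, maxAge time.Duration) bool {
	if !t.IsConversationScraped {
		return true
	}
	return time.Since(t.LastScrapedAt) > maxAge
}

func main() {
	stale := tweetScrapeState{
		IsConversationScraped: true,
		LastScrapedAt:         time.Now().Add(-48 * time.Hour),
	}
	fmt.Println(needsConversationScrape(stale, 24*time.Hour)) // true: scraped, but too long ago
}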