diff --git a/persistence/tweet_queries.go b/persistence/tweet_queries.go
index c1eae53..3967f91 100644
--- a/persistence/tweet_queries.go
+++ b/persistence/tweet_queries.go
@@ -4,7 +4,6 @@ import (
 	"database/sql"
 	"errors"
 	"fmt"
-	"strings"
 
 	"offline_twitter/scraper"
 )
@@ -22,42 +21,36 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
 		}
 	}
 
-	_, err := db.Exec(`
+	_, err := db.NamedExec(`
 		insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id,
 		                    quoted_tweet_id, mentions, reply_mentions, hashtags, space_id, tombstone_type, is_stub, is_content_downloaded,
 		                    is_conversation_scraped, last_scraped_at)
-		values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, nullif(?, ''), (select rowid from tombstone_types where short_name=?), ?, ?, ?, ?)
+		values (:id, :user_id, :text, :posted_at, :num_likes, :num_retweets, :num_replies, :num_quote_tweets, :in_reply_to_id, :quoted_tweet_id, :mentions, :reply_mentions, :hashtags, nullif(:space_id, ''), (select rowid from tombstone_types where short_name=:tombstone_type), :is_stub, :is_content_downloaded, :is_conversation_scraped, :last_scraped_at)
 		    on conflict do update
 		   set text=(case
 		             when is_stub then
-		                 ?
+		                 :text
 		             else
 		                 text
 		             end
 		       ),
-		       num_likes=?,
-		       num_retweets=?,
-		       num_replies=?,
-		       num_quote_tweets=?,
-		       is_stub=(is_stub and ?),
+		       num_likes=(case when :is_stub then num_likes else :num_likes end),
+		       num_retweets=(case when :is_stub then num_retweets else :num_retweets end),
+		       num_replies=(case when :is_stub then num_replies else :num_replies end),
+		       num_quote_tweets=(case when :is_stub then num_quote_tweets else :num_quote_tweets end),
+		       is_stub=(is_stub and :is_stub),
 		       tombstone_type=(case
-		                       when ?='unavailable' and tombstone_type not in (0, 4) then
+		                       when :tombstone_type='unavailable' and tombstone_type not in (0, 4) then
 		                           tombstone_type
 		                       else
-		                           (select rowid from tombstone_types where short_name=?)
+		                           (select rowid from tombstone_types where short_name=:tombstone_type)
 		                       end
 		       ),
-		       is_content_downloaded=(is_content_downloaded or ?),
-		       is_conversation_scraped=(is_conversation_scraped or ?),
-		       last_scraped_at=max(last_scraped_at, ?)
+		       is_content_downloaded=(is_content_downloaded or :is_content_downloaded),
+		       is_conversation_scraped=(is_conversation_scraped or :is_conversation_scraped),
+		       last_scraped_at=max(last_scraped_at, :last_scraped_at)
 		`,
-		t.ID, t.UserID, t.Text, t.PostedAt, t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyToID,
-		t.QuotedTweetID, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions),
-		strings.Join(t.Hashtags, ","), t.SpaceID, t.TombstoneType, t.IsStub, t.IsContentDownloaded, t.IsConversationScraped,
-		t.LastScrapedAt,
-
-		t.Text, t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsStub, t.TombstoneType, t.TombstoneType,
-		t.IsContentDownloaded, t.IsConversationScraped, t.LastScrapedAt,
+		t,
 	)
 
 	if err != nil {
@@ -119,82 +112,53 @@ func (p Profile) IsTweetInDatabase(id scraper.TweetID) bool {
 func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
 	db := p.DB
 
-	stmt, err := db.Prepare(`
+	var t scraper.Tweet
+	err := db.Get(&t, `
 		select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, quoted_tweet_id,
-		       mentions, reply_mentions, hashtags, ifnull(space_id, ''), ifnull(tombstone_types.short_name, ""), is_stub,
+		       mentions, reply_mentions, hashtags, ifnull(space_id, '') space_id, ifnull(tombstone_types.short_name, '') tombstone_type, is_stub,
 		       is_content_downloaded, is_conversation_scraped, last_scraped_at
 		  from tweets
 		  left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
 		 where id = ?
-	`)
+	`, id)
 	if err != nil {
-		return scraper.Tweet{}, fmt.Errorf("Error preparing statement in GetTweetByID(%d):\n %w", id, err)
-	}
-	defer stmt.Close()
-
-	var t scraper.Tweet
-	var mentions string
-	var reply_mentions string
-	var hashtags string
-
-	row := stmt.QueryRow(id)
-	err = row.Scan(&t.ID, &t.UserID, &t.Text, &t.PostedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyToID,
-		&t.QuotedTweetID, &mentions, &reply_mentions, &hashtags, &t.SpaceID, &t.TombstoneType, &t.IsStub, &t.IsContentDownloaded,
-		&t.IsConversationScraped, &t.LastScrapedAt)
-	if err != nil {
-		return t, fmt.Errorf("Error parsing result in GetTweetByID(%d):\n %w", id, err)
-	}
-
-	t.Mentions = []scraper.UserHandle{}
-	for _, m := range strings.Split(mentions, ",") {
-		if m != "" {
-			t.Mentions = append(t.Mentions, scraper.UserHandle(m))
-		}
-	}
-	t.ReplyMentions = []scraper.UserHandle{}
-	for _, m := range strings.Split(reply_mentions, ",") {
-		if m != "" {
-			t.ReplyMentions = append(t.ReplyMentions, scraper.UserHandle(m))
-		}
-	}
-	t.Hashtags = []string{}
-	for _, h := range strings.Split(hashtags, ",") {
-		if h != "" {
-			t.Hashtags = append(t.Hashtags, h)
-		}
+		return scraper.Tweet{}, fmt.Errorf("Error executing GetTweetByID(%d):\n %w", id, err)
 	}
 
 	t.Spaces = []scraper.Space{}
 	if t.SpaceID != "" {
 		space, err := p.GetSpaceById(t.SpaceID)
 		if err != nil {
-			return t, err
+			return t, fmt.Errorf("Error retrieving space with ID %s (tweet %d):\n %w", t.SpaceID, t.ID, err)
 		}
 		t.Spaces = append(t.Spaces, space)
 	}
 
 	imgs, err := p.GetImagesForTweet(t)
 	if err != nil {
-		return t, err
+		return t, fmt.Errorf("Error retrieving images for tweet %d:\n %w", t.ID, err)
 	}
 	t.Images = imgs
 
 	vids, err := p.GetVideosForTweet(t)
 	if err != nil {
-		return t, err
+		return t, fmt.Errorf("Error retrieving videos for tweet %d:\n %w", t.ID, err)
 	}
 	t.Videos = vids
 
 	polls, err := p.GetPollsForTweet(t)
 	if err != nil {
-		return t, err
+		return t, fmt.Errorf("Error retrieving polls for tweet %d:\n %w", t.ID, err)
 	}
 	t.Polls = polls
 
 	urls, err := p.GetUrlsForTweet(t)
+	if err != nil {
+		return t, fmt.Errorf("Error retrieving urls for tweet %d:\n %w", t.ID, err)
+	}
 	t.Urls = urls
 
-	return t, err
+	return t, nil
 }
 
 /**
diff --git a/persistence/tweet_queries_test.go b/persistence/tweet_queries_test.go
index dc91eac..df537ca 100644
--- a/persistence/tweet_queries_test.go
+++ b/persistence/tweet_queries_test.go
@@ -77,6 +77,10 @@ func TestNoWorseningTweet(t *testing.T) {
 	tweet.IsConversationScraped = true
 	tweet.LastScrapedAt = scraper.TimestampFromUnix(1000)
 	tweet.Text = "Yes text"
+	tweet.NumLikes = 10
+	tweet.NumRetweets = 11
+	tweet.NumQuoteTweets = 12
+	tweet.NumReplies = 13
 
 	// Save the tweet
 	err := profile.SaveTweet(tweet)
@@ -90,6 +94,10 @@ func TestNoWorseningTweet(t *testing.T) {
 	tweet.Text = ""
+	tweet.NumLikes = 0
+	tweet.NumRetweets = 0
+	tweet.NumQuoteTweets = 0
+	tweet.NumReplies = 0
 	err = profile.SaveTweet(tweet)
 	require.NoError(err)
 
 	// Reload the tweet
 	new_tweet, err := profile.GetTweetById(tweet.ID)
@@ -100,6 +108,10 @@ func TestNoWorseningTweet(t *testing.T) {
 	assert.True(new_tweet.IsConversationScraped, "Should have preserved is-conversation-scraped status")
 	assert.Equal(int64(1000), new_tweet.LastScrapedAt.Unix(), "Should have preserved last-scraped-at time")
 	assert.Equal(new_tweet.Text, "Yes text", "Text should not get clobbered if it becomes unavailable")
+	assert.Equal(10, new_tweet.NumLikes)
+	assert.Equal(11, new_tweet.NumRetweets)
+	assert.Equal(12, new_tweet.NumQuoteTweets)
+	assert.Equal(13, new_tweet.NumReplies)
 }
 
 /**
diff --git a/persistence/utils_test.go b/persistence/utils_test.go
index 05fd4fc..176fc70 100644
--- a/persistence/utils_test.go
+++ b/persistence/utils_test.go
@@ -166,8 +166,8 @@ func create_stable_tweet() scraper.Tweet {
 		Images: []scraper.Image{
 			create_image_from_id(-1),
 		},
-		Mentions: []scraper.UserHandle{},
-		Hashtags: []string{},
+		Mentions: scraper.CommaSeparatedList{},
+		Hashtags: scraper.CommaSeparatedList{},
 		Polls: []scraper.Poll{
 			create_poll_from_id(-1),
 		},
@@ -257,9 +257,9 @@ func create_dummy_tweet() scraper.Tweet {
 		Videos: []scraper.Video{vid},
 		Urls: []scraper.Url{url1, url2},
 		Images: []scraper.Image{img1, img2},
-		Mentions: []scraper.UserHandle{"mention1", "mention2"},
-		ReplyMentions: []scraper.UserHandle{"replymention1", "replymention2"},
-		Hashtags: []string{"hash1", "hash2"},
+		Mentions: scraper.CommaSeparatedList{"mention1", "mention2"},
+		ReplyMentions: scraper.CommaSeparatedList{"replymention1", "replymention2"},
+		Hashtags: scraper.CommaSeparatedList{"hash1", "hash2"},
 		Polls: []scraper.Poll{poll},
 		Spaces: []scraper.Space{space},
 		SpaceID: space_id,
@@ -278,9 +278,9 @@ func create_dummy_tombstone() scraper.Tweet {
 		UserID: -1,
 		TombstoneType: "deleted",
 		IsStub: true,
-		Mentions: []scraper.UserHandle{},
-		ReplyMentions: []scraper.UserHandle{},
-		Hashtags: []string{},
+		Mentions: scraper.CommaSeparatedList{},
+		ReplyMentions: scraper.CommaSeparatedList{},
+		Hashtags: scraper.CommaSeparatedList{},
 		Spaces: []scraper.Space{},
 	}
 }
diff --git a/scraper/api_types_v2_test.go b/scraper/api_types_v2_test.go
index c8ce01f..ebeec9d 100644
--- a/scraper/api_types_v2_test.go
+++ b/scraper/api_types_v2_test.go
@@ -125,7 +125,7 @@ func TestAPIV2ParseTweetWithQuotedTweet(t *testing.T) {
 	assert.Equal(TweetID(1485689207435710464), quoted_tweet.InReplyToID)
 	assert.Equal(TweetID(0), quoted_tweet.QuotedTweetID)
 	assert.Equal(1, len(quoted_tweet.ReplyMentions))
-	assert.Contains(quoted_tweet.ReplyMentions, UserHandle("michaelmalice"))
+	assert.Contains(quoted_tweet.ReplyMentions, "michaelmalice")
 	assert.Equal(1, quoted_tweet.NumReplies)
 	assert.Equal(12, quoted_tweet.NumLikes)
 
diff --git a/scraper/tweet.go b/scraper/tweet.go
index eb7ec85..27f9946 100644
--- a/scraper/tweet.go
+++ b/scraper/tweet.go
@@ -1,6 +1,7 @@
 package scraper
 
 import (
+	"database/sql/driver"
 	"fmt"
 	log "github.com/sirupsen/logrus"
 	"strings"
@@ -9,44 +10,74 @@ import (
 	"offline_twitter/terminal_utils"
 )
 
-const DEFAULT_MAX_REPLIES_EAGER_LOAD = 50
 
 type TweetID int64
 
+// CommaSeparatedList is a list of strings that is stored in the database as one comma-joined string.
+type CommaSeparatedList []string
+
+// Scan implements sql.Scanner.  It splits a comma-joined column value into its items, skipping
+// empty items.  A nil (SQL NULL) source yields an empty list; unsupported source types are
+// reported as an error rather than a panic, so a bad row cannot crash the caller.
+func (l *CommaSeparatedList) Scan(src interface{}) error {
+	*l = CommaSeparatedList{}
+	var joined string
+	switch v := src.(type) {
+	case nil:
+		return nil
+	case string:
+		joined = v
+	case []byte:
+		joined = string(v)
+	default:
+		return fmt.Errorf("CommaSeparatedList: cannot scan value of type %T", src)
+	}
+	for _, item := range strings.Split(joined, ",") {
+		if item != "" {
+			*l = append(*l, item)
+		}
+	}
+	return nil
+}
+
+// Value implements driver.Valuer.  It joins the items with commas.
+func (l CommaSeparatedList) Value() (driver.Value, error) {
+	return strings.Join(l, ","), nil
+}
+
 type Tweet struct {
-	ID             TweetID
-	UserID         UserID
+	ID             TweetID `db:"id"`
+	UserID         UserID  `db:"user_id"`
 	UserHandle     UserHandle // For processing tombstones
 	User           *User
-	Text           string
-	PostedAt       Timestamp
-	NumLikes       int
-	NumRetweets    int
-	NumReplies     int
-	NumQuoteTweets int
-	InReplyToID    TweetID
-	QuotedTweetID  TweetID
+	Text           string    `db:"text"`
+	IsExpandable   bool      `db:"is_expandable"`
+	PostedAt       Timestamp `db:"posted_at"`
+	NumLikes       int       `db:"num_likes"`
+	NumRetweets    int       `db:"num_retweets"`
+	NumReplies     int       `db:"num_replies"`
+	NumQuoteTweets int       `db:"num_quote_tweets"`
+	InReplyToID    TweetID   `db:"in_reply_to_id"`
+	QuotedTweetID  TweetID   `db:"quoted_tweet_id"`
 
 	Images        []Image
 	Videos        []Video
-	Mentions      []UserHandle
-	ReplyMentions []UserHandle
-	Hashtags      []string
 	Urls          []Url
 	Polls         []Poll
+	Mentions      CommaSeparatedList `db:"mentions"`
+	ReplyMentions CommaSeparatedList `db:"reply_mentions"`
+	Hashtags      CommaSeparatedList `db:"hashtags"`
 
 	// TODO get-rid-of-spaces: Might be good to get rid of `Spaces`. Only used in APIv1 I think.
 	// A first-step would be to delete the Spaces after pulling them out of a Tweet into the Trove
 	// in ParseTweetResponse. Then they will only be getting saved once rather than twice.
 	Spaces  []Space
-	SpaceID SpaceID
+	SpaceID SpaceID `db:"space_id"`
 
-	TombstoneType string
-	IsStub        bool
+	TombstoneType string `db:"tombstone_type"`
+	IsStub        bool   `db:"is_stub"`
 
-	IsContentDownloaded   bool
-	IsConversationScraped bool
-	LastScrapedAt         Timestamp
+	IsContentDownloaded   bool      `db:"is_content_downloaded"`
+	IsConversationScraped bool      `db:"is_conversation_scraped"`
+	LastScrapedAt         Timestamp `db:"last_scraped_at"`
 }
 
 func (t Tweet) String() string {
@@ -150,14 +181,14 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
 
 	// Process `@` mentions and reply-mentions
 	for _, mention := range apiTweet.Entities.Mentions {
-		ret.Mentions = append(ret.Mentions, UserHandle(mention.UserName))
+		ret.Mentions = append(ret.Mentions, mention.UserName)
 	}
 	for _, mention := range strings.Split(apiTweet.Entities.ReplyMentions, " ") {
 		if mention != "" {
 			if mention[0] != '@' {
 				panic(fmt.Errorf("Unknown ReplyMention value %q:\n %w", apiTweet.Entities.ReplyMentions, EXTERNAL_API_ERROR))
 			}
-			ret.ReplyMentions = append(ret.ReplyMentions, UserHandle(mention[1:]))
+			ret.ReplyMentions = append(ret.ReplyMentions, mention[1:])
 		}
 	}
 
diff --git a/scraper/tweet_test.go b/scraper/tweet_test.go
index cd27600..0c0237e 100644
--- a/scraper/tweet_test.go
+++ b/scraper/tweet_test.go
@@ -35,7 +35,7 @@ func TestParseSingleTweet(t *testing.T) {
 	assert.Equal("The fact that @michaelmalice new book ‘The Anarchist Handbook’ is just absolutely destroying on the charts is the "+
 		"largest white pill I’ve swallowed in years.", tweet.Text)
 	assert.Len(tweet.Mentions, 1)
-	assert.Contains(tweet.Mentions, UserHandle("michaelmalice"))
+	assert.Contains(tweet.Mentions, "michaelmalice")
 	assert.Empty(tweet.Urls)
 	assert.Equal(int64(1621639105), tweet.PostedAt.Unix())
 	assert.Zero(tweet.QuotedTweetID)
@@ -179,7 +179,7 @@ func TestTweetWithLotsOfReplyMentions(t *testing.T) {
 	tweet := load_tweet_from_file("test_responses/single_tweets/tweet_with_at_mentions_in_front.json")
 	assert.Len(tweet.ReplyMentions, 4)
 
-	for i, v := range []UserHandle{"rob_mose", "primalpoly", "jmasseypoet", "SpaceX"} {
+	for i, v := range []string{"rob_mose", "primalpoly", "jmasseypoet", "SpaceX"} {
 		assert.Equal(v, tweet.ReplyMentions[i])
 	}
 }