BUGFIX: don't clobber num_likes, num_retweets etc. if tweet is a stub

- Convert Tweet queries to sqlx, including a new CommaSeparatedList type to convert Hashtags, Mentions and ReplyMentions
This commit is contained in:
Alessio 2023-06-03 08:30:49 -03:00
parent dea37c7556
commit 815a8180da
6 changed files with 93 additions and 96 deletions

View File

@@ -4,7 +4,6 @@ import (
"database/sql" "database/sql"
"errors" "errors"
"fmt" "fmt"
"strings"
"offline_twitter/scraper" "offline_twitter/scraper"
) )
@@ -22,42 +21,36 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
} }
} }
_, err := db.Exec(` _, err := db.NamedExec(`
insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id,
quoted_tweet_id, mentions, reply_mentions, hashtags, space_id, tombstone_type, is_stub, is_content_downloaded, quoted_tweet_id, mentions, reply_mentions, hashtags, space_id, tombstone_type, is_stub, is_content_downloaded,
is_conversation_scraped, last_scraped_at) is_conversation_scraped, last_scraped_at)
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, nullif(?, ''), (select rowid from tombstone_types where short_name=?), ?, ?, ?, ?) values (:id, :user_id, :text, :posted_at, :num_likes, :num_retweets, :num_replies, :num_quote_tweets, :in_reply_to_id, :quoted_tweet_id, :mentions, :reply_mentions, :hashtags, nullif(:space_id, ''), (select rowid from tombstone_types where short_name=:tombstone_type), :is_stub, :is_content_downloaded, :is_conversation_scraped, :last_scraped_at)
on conflict do update on conflict do update
set text=(case set text=(case
when is_stub then when is_stub then
? :text
else else
text text
end end
), ),
num_likes=?, num_likes=(case when :is_stub then num_likes else :num_likes end),
num_retweets=?, num_retweets=(case when :is_stub then num_retweets else :num_retweets end),
num_replies=?, num_replies=(case when :is_stub then num_replies else :num_replies end),
num_quote_tweets=?, num_quote_tweets=(case when :is_stub then num_quote_tweets else :num_quote_tweets end),
is_stub=(is_stub and ?), is_stub=(is_stub and :is_stub),
tombstone_type=(case tombstone_type=(case
when ?='unavailable' and tombstone_type not in (0, 4) then when :tombstone_type='unavailable' and tombstone_type not in (0, 4) then
tombstone_type tombstone_type
else else
(select rowid from tombstone_types where short_name=?) (select rowid from tombstone_types where short_name=:tombstone_type)
end end
), ),
is_content_downloaded=(is_content_downloaded or ?), is_content_downloaded=(is_content_downloaded or :is_content_downloaded),
is_conversation_scraped=(is_conversation_scraped or ?), is_conversation_scraped=(is_conversation_scraped or :is_conversation_scraped),
last_scraped_at=max(last_scraped_at, ?) last_scraped_at=max(last_scraped_at, :last_scraped_at)
`, `,
t.ID, t.UserID, t.Text, t.PostedAt, t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyToID, t,
t.QuotedTweetID, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions),
strings.Join(t.Hashtags, ","), t.SpaceID, t.TombstoneType, t.IsStub, t.IsContentDownloaded, t.IsConversationScraped,
t.LastScrapedAt,
t.Text, t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsStub, t.TombstoneType, t.TombstoneType,
t.IsContentDownloaded, t.IsConversationScraped, t.LastScrapedAt,
) )
if err != nil { if err != nil {
@@ -119,82 +112,53 @@ func (p Profile) IsTweetInDatabase(id scraper.TweetID) bool {
func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) { func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
db := p.DB db := p.DB
stmt, err := db.Prepare(` var t scraper.Tweet
err := db.Get(&t, `
select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, quoted_tweet_id, select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, quoted_tweet_id,
mentions, reply_mentions, hashtags, ifnull(space_id, ''), ifnull(tombstone_types.short_name, ""), is_stub, mentions, reply_mentions, hashtags, ifnull(space_id, '') space_id, ifnull(tombstone_types.short_name, "") tombstone_type, is_stub,
is_content_downloaded, is_conversation_scraped, last_scraped_at is_content_downloaded, is_conversation_scraped, last_scraped_at
from tweets left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid from tweets left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
where id = ? where id = ?
`) `, id)
if err != nil { if err != nil {
return scraper.Tweet{}, fmt.Errorf("Error preparing statement in GetTweetByID(%d):\n %w", id, err) return scraper.Tweet{}, fmt.Errorf("Error executing GetTweetByID(%d):\n %w", id, err)
}
defer stmt.Close()
var t scraper.Tweet
var mentions string
var reply_mentions string
var hashtags string
row := stmt.QueryRow(id)
err = row.Scan(&t.ID, &t.UserID, &t.Text, &t.PostedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyToID,
&t.QuotedTweetID, &mentions, &reply_mentions, &hashtags, &t.SpaceID, &t.TombstoneType, &t.IsStub, &t.IsContentDownloaded,
&t.IsConversationScraped, &t.LastScrapedAt)
if err != nil {
return t, fmt.Errorf("Error parsing result in GetTweetByID(%d):\n %w", id, err)
}
t.Mentions = []scraper.UserHandle{}
for _, m := range strings.Split(mentions, ",") {
if m != "" {
t.Mentions = append(t.Mentions, scraper.UserHandle(m))
}
}
t.ReplyMentions = []scraper.UserHandle{}
for _, m := range strings.Split(reply_mentions, ",") {
if m != "" {
t.ReplyMentions = append(t.ReplyMentions, scraper.UserHandle(m))
}
}
t.Hashtags = []string{}
for _, h := range strings.Split(hashtags, ",") {
if h != "" {
t.Hashtags = append(t.Hashtags, h)
}
} }
t.Spaces = []scraper.Space{} t.Spaces = []scraper.Space{}
if t.SpaceID != "" { if t.SpaceID != "" {
space, err := p.GetSpaceById(t.SpaceID) space, err := p.GetSpaceById(t.SpaceID)
if err != nil { if err != nil {
return t, err return t, fmt.Errorf("Error retrieving space with ID %s (tweet %d):\n %w", t.SpaceID, t.ID, err)
} }
t.Spaces = append(t.Spaces, space) t.Spaces = append(t.Spaces, space)
} }
imgs, err := p.GetImagesForTweet(t) imgs, err := p.GetImagesForTweet(t)
if err != nil { if err != nil {
return t, err return t, fmt.Errorf("Error retrieving images for tweet %d:\n %w", t.ID, err)
} }
t.Images = imgs t.Images = imgs
vids, err := p.GetVideosForTweet(t) vids, err := p.GetVideosForTweet(t)
if err != nil { if err != nil {
return t, err return t, fmt.Errorf("Error retrieving videos for tweet %d:\n %w", t.ID, err)
} }
t.Videos = vids t.Videos = vids
polls, err := p.GetPollsForTweet(t) polls, err := p.GetPollsForTweet(t)
if err != nil { if err != nil {
return t, err return t, fmt.Errorf("Error retrieving polls for tweet %d:\n %w", t.ID, err)
} }
t.Polls = polls t.Polls = polls
urls, err := p.GetUrlsForTweet(t) urls, err := p.GetUrlsForTweet(t)
if err != nil {
return t, fmt.Errorf("Error retrieving urls for tweet %d:\n %w", t.ID, err)
}
t.Urls = urls t.Urls = urls
return t, err return t, nil
} }
/** /**

View File

@@ -77,6 +77,10 @@ func TestNoWorseningTweet(t *testing.T) {
tweet.IsConversationScraped = true tweet.IsConversationScraped = true
tweet.LastScrapedAt = scraper.TimestampFromUnix(1000) tweet.LastScrapedAt = scraper.TimestampFromUnix(1000)
tweet.Text = "Yes text" tweet.Text = "Yes text"
tweet.NumLikes = 10
tweet.NumRetweets = 11
tweet.NumQuoteTweets = 12
tweet.NumReplies = 13
// Save the tweet // Save the tweet
err := profile.SaveTweet(tweet) err := profile.SaveTweet(tweet)
@@ -90,6 +94,10 @@ func TestNoWorseningTweet(t *testing.T) {
tweet.Text = "" tweet.Text = ""
err = profile.SaveTweet(tweet) err = profile.SaveTweet(tweet)
require.NoError(err) require.NoError(err)
tweet.NumLikes = 0
tweet.NumRetweets = 0
tweet.NumQuoteTweets = 0
tweet.NumReplies = 0
// Reload the tweet // Reload the tweet
new_tweet, err := profile.GetTweetById(tweet.ID) new_tweet, err := profile.GetTweetById(tweet.ID)
@@ -100,6 +108,10 @@ func TestNoWorseningTweet(t *testing.T) {
assert.True(new_tweet.IsConversationScraped, "Should have preserved is-conversation-scraped status") assert.True(new_tweet.IsConversationScraped, "Should have preserved is-conversation-scraped status")
assert.Equal(int64(1000), new_tweet.LastScrapedAt.Unix(), "Should have preserved last-scraped-at time") assert.Equal(int64(1000), new_tweet.LastScrapedAt.Unix(), "Should have preserved last-scraped-at time")
assert.Equal(new_tweet.Text, "Yes text", "Text should not get clobbered if it becomes unavailable") assert.Equal(new_tweet.Text, "Yes text", "Text should not get clobbered if it becomes unavailable")
assert.Equal(10, new_tweet.NumLikes)
assert.Equal(11, new_tweet.NumRetweets)
assert.Equal(12, new_tweet.NumQuoteTweets)
assert.Equal(13, new_tweet.NumReplies)
} }
/** /**

View File

@@ -166,8 +166,8 @@ func create_stable_tweet() scraper.Tweet {
Images: []scraper.Image{ Images: []scraper.Image{
create_image_from_id(-1), create_image_from_id(-1),
}, },
Mentions: []scraper.UserHandle{}, Mentions: scraper.CommaSeparatedList{},
Hashtags: []string{}, Hashtags: scraper.CommaSeparatedList{},
Polls: []scraper.Poll{ Polls: []scraper.Poll{
create_poll_from_id(-1), create_poll_from_id(-1),
}, },
@@ -257,9 +257,9 @@ func create_dummy_tweet() scraper.Tweet {
Videos: []scraper.Video{vid}, Videos: []scraper.Video{vid},
Urls: []scraper.Url{url1, url2}, Urls: []scraper.Url{url1, url2},
Images: []scraper.Image{img1, img2}, Images: []scraper.Image{img1, img2},
Mentions: []scraper.UserHandle{"mention1", "mention2"}, Mentions: scraper.CommaSeparatedList{"mention1", "mention2"},
ReplyMentions: []scraper.UserHandle{"replymention1", "replymention2"}, ReplyMentions: scraper.CommaSeparatedList{"replymention1", "replymention2"},
Hashtags: []string{"hash1", "hash2"}, Hashtags: scraper.CommaSeparatedList{"hash1", "hash2"},
Polls: []scraper.Poll{poll}, Polls: []scraper.Poll{poll},
Spaces: []scraper.Space{space}, Spaces: []scraper.Space{space},
SpaceID: space_id, SpaceID: space_id,
@@ -278,9 +278,9 @@ func create_dummy_tombstone() scraper.Tweet {
UserID: -1, UserID: -1,
TombstoneType: "deleted", TombstoneType: "deleted",
IsStub: true, IsStub: true,
Mentions: []scraper.UserHandle{}, Mentions: scraper.CommaSeparatedList{},
ReplyMentions: []scraper.UserHandle{}, ReplyMentions: scraper.CommaSeparatedList{},
Hashtags: []string{}, Hashtags: scraper.CommaSeparatedList{},
Spaces: []scraper.Space{}, Spaces: []scraper.Space{},
} }
} }

View File

@@ -125,7 +125,7 @@ func TestAPIV2ParseTweetWithQuotedTweet(t *testing.T) {
assert.Equal(TweetID(1485689207435710464), quoted_tweet.InReplyToID) assert.Equal(TweetID(1485689207435710464), quoted_tweet.InReplyToID)
assert.Equal(TweetID(0), quoted_tweet.QuotedTweetID) assert.Equal(TweetID(0), quoted_tweet.QuotedTweetID)
assert.Equal(1, len(quoted_tweet.ReplyMentions)) assert.Equal(1, len(quoted_tweet.ReplyMentions))
assert.Contains(quoted_tweet.ReplyMentions, UserHandle("michaelmalice")) assert.Contains(quoted_tweet.ReplyMentions, "michaelmalice")
assert.Equal(1, quoted_tweet.NumReplies) assert.Equal(1, quoted_tweet.NumReplies)
assert.Equal(12, quoted_tweet.NumLikes) assert.Equal(12, quoted_tweet.NumLikes)

View File

@@ -1,6 +1,7 @@
package scraper package scraper
import ( import (
"database/sql/driver"
"fmt" "fmt"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"strings" "strings"
@@ -9,44 +10,64 @@ import (
"offline_twitter/terminal_utils" "offline_twitter/terminal_utils"
) )
const DEFAULT_MAX_REPLIES_EAGER_LOAD = 50
type TweetID int64 type TweetID int64
type CommaSeparatedList []string
func (l *CommaSeparatedList) Scan(src interface{}) error {
*l = CommaSeparatedList{}
switch src.(type) {
case string:
for _, v := range strings.Split(src.(string), ",") {
if v != "" {
*l = append(*l, v)
}
}
default:
panic("Should be a string")
}
return nil
}
func (l CommaSeparatedList) Value() (driver.Value, error) {
return strings.Join(l, ","), nil
}
type Tweet struct { type Tweet struct {
ID TweetID ID TweetID `db:"id"`
UserID UserID UserID UserID `db:"user_id"`
UserHandle UserHandle // For processing tombstones UserHandle UserHandle // For processing tombstones
User *User User *User
Text string Text string `db:"text"`
PostedAt Timestamp IsExpandable bool `db:"is_expandable"`
NumLikes int PostedAt Timestamp `db:"posted_at"`
NumRetweets int NumLikes int `db:"num_likes"`
NumReplies int NumRetweets int `db:"num_retweets"`
NumQuoteTweets int NumReplies int `db:"num_replies"`
InReplyToID TweetID NumQuoteTweets int `db:"num_quote_tweets"`
QuotedTweetID TweetID InReplyToID TweetID `db:"in_reply_to_id"`
QuotedTweetID TweetID `db:"quoted_tweet_id"`
Images []Image Images []Image
Videos []Video Videos []Video
Mentions []UserHandle
ReplyMentions []UserHandle
Hashtags []string
Urls []Url Urls []Url
Polls []Poll Polls []Poll
Mentions CommaSeparatedList `db:"mentions"`
ReplyMentions CommaSeparatedList `db:"reply_mentions"`
Hashtags CommaSeparatedList `db:"hashtags"`
// TODO get-rid-of-spaces: Might be good to get rid of `Spaces`. Only used in APIv1 I think. // TODO get-rid-of-spaces: Might be good to get rid of `Spaces`. Only used in APIv1 I think.
// A first-step would be to delete the Spaces after pulling them out of a Tweet into the Trove // A first-step would be to delete the Spaces after pulling them out of a Tweet into the Trove
// in ParseTweetResponse. Then they will only be getting saved once rather than twice. // in ParseTweetResponse. Then they will only be getting saved once rather than twice.
Spaces []Space Spaces []Space
SpaceID SpaceID SpaceID SpaceID `db:"space_id"`
TombstoneType string TombstoneType string `db:"tombstone_type"`
IsStub bool IsStub bool `db:"is_stub"`
IsContentDownloaded bool IsContentDownloaded bool `db:"is_content_downloaded"`
IsConversationScraped bool IsConversationScraped bool `db:"is_conversation_scraped"`
LastScrapedAt Timestamp LastScrapedAt Timestamp `db:"last_scraped_at"`
} }
func (t Tweet) String() string { func (t Tweet) String() string {
@@ -150,14 +171,14 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
// Process `@` mentions and reply-mentions // Process `@` mentions and reply-mentions
for _, mention := range apiTweet.Entities.Mentions { for _, mention := range apiTweet.Entities.Mentions {
ret.Mentions = append(ret.Mentions, UserHandle(mention.UserName)) ret.Mentions = append(ret.Mentions, mention.UserName)
} }
for _, mention := range strings.Split(apiTweet.Entities.ReplyMentions, " ") { for _, mention := range strings.Split(apiTweet.Entities.ReplyMentions, " ") {
if mention != "" { if mention != "" {
if mention[0] != '@' { if mention[0] != '@' {
panic(fmt.Errorf("Unknown ReplyMention value %q:\n %w", apiTweet.Entities.ReplyMentions, EXTERNAL_API_ERROR)) panic(fmt.Errorf("Unknown ReplyMention value %q:\n %w", apiTweet.Entities.ReplyMentions, EXTERNAL_API_ERROR))
} }
ret.ReplyMentions = append(ret.ReplyMentions, UserHandle(mention[1:])) ret.ReplyMentions = append(ret.ReplyMentions, mention[1:])
} }
} }

View File

@@ -35,7 +35,7 @@ func TestParseSingleTweet(t *testing.T) {
assert.Equal("The fact that @michaelmalice new book The Anarchist Handbook is just absolutely destroying on the charts is the "+ assert.Equal("The fact that @michaelmalice new book The Anarchist Handbook is just absolutely destroying on the charts is the "+
"largest white pill Ive swallowed in years.", tweet.Text) "largest white pill Ive swallowed in years.", tweet.Text)
assert.Len(tweet.Mentions, 1) assert.Len(tweet.Mentions, 1)
assert.Contains(tweet.Mentions, UserHandle("michaelmalice")) assert.Contains(tweet.Mentions, "michaelmalice")
assert.Empty(tweet.Urls) assert.Empty(tweet.Urls)
assert.Equal(int64(1621639105), tweet.PostedAt.Unix()) assert.Equal(int64(1621639105), tweet.PostedAt.Unix())
assert.Zero(tweet.QuotedTweetID) assert.Zero(tweet.QuotedTweetID)
@@ -179,7 +179,7 @@ func TestTweetWithLotsOfReplyMentions(t *testing.T) {
tweet := load_tweet_from_file("test_responses/single_tweets/tweet_with_at_mentions_in_front.json") tweet := load_tweet_from_file("test_responses/single_tweets/tweet_with_at_mentions_in_front.json")
assert.Len(tweet.ReplyMentions, 4) assert.Len(tweet.ReplyMentions, 4)
for i, v := range []UserHandle{"rob_mose", "primalpoly", "jmasseypoet", "SpaceX"} { for i, v := range []string{"rob_mose", "primalpoly", "jmasseypoet", "SpaceX"} {
assert.Equal(v, tweet.ReplyMentions[i]) assert.Equal(v, tweet.ReplyMentions[i])
} }
} }