BUGFIX: don't clobber num_likes, num_retweets etc. if tweet is a stub

- Convert Tweet queries to sqlx, including new CommaSeparatedList type to convert Hashtags, Mentions and ReplyMentions
This commit is contained in:
Alessio 2023-06-03 08:30:49 -03:00
parent dea37c7556
commit 815a8180da
6 changed files with 93 additions and 96 deletions

View File

@ -4,7 +4,6 @@ import (
"database/sql"
"errors"
"fmt"
"strings"
"offline_twitter/scraper"
)
@ -22,42 +21,36 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
}
}
_, err := db.Exec(`
_, err := db.NamedExec(`
insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id,
quoted_tweet_id, mentions, reply_mentions, hashtags, space_id, tombstone_type, is_stub, is_content_downloaded,
is_conversation_scraped, last_scraped_at)
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, nullif(?, ''), (select rowid from tombstone_types where short_name=?), ?, ?, ?, ?)
values (:id, :user_id, :text, :posted_at, :num_likes, :num_retweets, :num_replies, :num_quote_tweets, :in_reply_to_id, :quoted_tweet_id, :mentions, :reply_mentions, :hashtags, nullif(:space_id, ''), (select rowid from tombstone_types where short_name=:tombstone_type), :is_stub, :is_content_downloaded, :is_conversation_scraped, :last_scraped_at)
on conflict do update
set text=(case
when is_stub then
?
:text
else
text
end
),
num_likes=?,
num_retweets=?,
num_replies=?,
num_quote_tweets=?,
is_stub=(is_stub and ?),
num_likes=(case when :is_stub then num_likes else :num_likes end),
num_retweets=(case when :is_stub then num_retweets else :num_retweets end),
num_replies=(case when :is_stub then num_replies else :num_replies end),
num_quote_tweets=(case when :is_stub then num_quote_tweets else :num_quote_tweets end),
is_stub=(is_stub and :is_stub),
tombstone_type=(case
when ?='unavailable' and tombstone_type not in (0, 4) then
when :tombstone_type='unavailable' and tombstone_type not in (0, 4) then
tombstone_type
else
(select rowid from tombstone_types where short_name=?)
(select rowid from tombstone_types where short_name=:tombstone_type)
end
),
is_content_downloaded=(is_content_downloaded or ?),
is_conversation_scraped=(is_conversation_scraped or ?),
last_scraped_at=max(last_scraped_at, ?)
is_content_downloaded=(is_content_downloaded or :is_content_downloaded),
is_conversation_scraped=(is_conversation_scraped or :is_conversation_scraped),
last_scraped_at=max(last_scraped_at, :last_scraped_at)
`,
t.ID, t.UserID, t.Text, t.PostedAt, t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyToID,
t.QuotedTweetID, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions),
strings.Join(t.Hashtags, ","), t.SpaceID, t.TombstoneType, t.IsStub, t.IsContentDownloaded, t.IsConversationScraped,
t.LastScrapedAt,
t.Text, t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsStub, t.TombstoneType, t.TombstoneType,
t.IsContentDownloaded, t.IsConversationScraped, t.LastScrapedAt,
t,
)
if err != nil {
@ -119,82 +112,53 @@ func (p Profile) IsTweetInDatabase(id scraper.TweetID) bool {
func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
db := p.DB
stmt, err := db.Prepare(`
var t scraper.Tweet
err := db.Get(&t, `
select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, quoted_tweet_id,
mentions, reply_mentions, hashtags, ifnull(space_id, ''), ifnull(tombstone_types.short_name, ""), is_stub,
mentions, reply_mentions, hashtags, ifnull(space_id, '') space_id, ifnull(tombstone_types.short_name, "") tombstone_type, is_stub,
is_content_downloaded, is_conversation_scraped, last_scraped_at
from tweets left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
where id = ?
`)
`, id)
if err != nil {
return scraper.Tweet{}, fmt.Errorf("Error preparing statement in GetTweetByID(%d):\n %w", id, err)
}
defer stmt.Close()
var t scraper.Tweet
var mentions string
var reply_mentions string
var hashtags string
row := stmt.QueryRow(id)
err = row.Scan(&t.ID, &t.UserID, &t.Text, &t.PostedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyToID,
&t.QuotedTweetID, &mentions, &reply_mentions, &hashtags, &t.SpaceID, &t.TombstoneType, &t.IsStub, &t.IsContentDownloaded,
&t.IsConversationScraped, &t.LastScrapedAt)
if err != nil {
return t, fmt.Errorf("Error parsing result in GetTweetByID(%d):\n %w", id, err)
}
t.Mentions = []scraper.UserHandle{}
for _, m := range strings.Split(mentions, ",") {
if m != "" {
t.Mentions = append(t.Mentions, scraper.UserHandle(m))
}
}
t.ReplyMentions = []scraper.UserHandle{}
for _, m := range strings.Split(reply_mentions, ",") {
if m != "" {
t.ReplyMentions = append(t.ReplyMentions, scraper.UserHandle(m))
}
}
t.Hashtags = []string{}
for _, h := range strings.Split(hashtags, ",") {
if h != "" {
t.Hashtags = append(t.Hashtags, h)
}
return scraper.Tweet{}, fmt.Errorf("Error executing GetTweetByID(%d):\n %w", id, err)
}
t.Spaces = []scraper.Space{}
if t.SpaceID != "" {
space, err := p.GetSpaceById(t.SpaceID)
if err != nil {
return t, err
return t, fmt.Errorf("Error retrieving space with ID %s (tweet %d):\n %w", t.SpaceID, t.ID, err)
}
t.Spaces = append(t.Spaces, space)
}
imgs, err := p.GetImagesForTweet(t)
if err != nil {
return t, err
return t, fmt.Errorf("Error retrieving images for tweet %d:\n %w", t.ID, err)
}
t.Images = imgs
vids, err := p.GetVideosForTweet(t)
if err != nil {
return t, err
return t, fmt.Errorf("Error retrieving videos for tweet %d:\n %w", t.ID, err)
}
t.Videos = vids
polls, err := p.GetPollsForTweet(t)
if err != nil {
return t, err
return t, fmt.Errorf("Error retrieving polls for tweet %d:\n %w", t.ID, err)
}
t.Polls = polls
urls, err := p.GetUrlsForTweet(t)
if err != nil {
return t, fmt.Errorf("Error retrieving urls for tweet %d:\n %w", t.ID, err)
}
t.Urls = urls
return t, err
return t, nil
}
/**

View File

@ -77,6 +77,10 @@ func TestNoWorseningTweet(t *testing.T) {
tweet.IsConversationScraped = true
tweet.LastScrapedAt = scraper.TimestampFromUnix(1000)
tweet.Text = "Yes text"
tweet.NumLikes = 10
tweet.NumRetweets = 11
tweet.NumQuoteTweets = 12
tweet.NumReplies = 13
// Save the tweet
err := profile.SaveTweet(tweet)
@ -90,6 +94,10 @@ func TestNoWorseningTweet(t *testing.T) {
tweet.Text = ""
err = profile.SaveTweet(tweet)
require.NoError(err)
tweet.NumLikes = 0
tweet.NumRetweets = 0
tweet.NumQuoteTweets = 0
tweet.NumReplies = 0
// Reload the tweet
new_tweet, err := profile.GetTweetById(tweet.ID)
@ -100,6 +108,10 @@ func TestNoWorseningTweet(t *testing.T) {
assert.True(new_tweet.IsConversationScraped, "Should have preserved is-conversation-scraped status")
assert.Equal(int64(1000), new_tweet.LastScrapedAt.Unix(), "Should have preserved last-scraped-at time")
assert.Equal(new_tweet.Text, "Yes text", "Text should not get clobbered if it becomes unavailable")
assert.Equal(10, new_tweet.NumLikes)
assert.Equal(11, new_tweet.NumRetweets)
assert.Equal(12, new_tweet.NumQuoteTweets)
assert.Equal(13, new_tweet.NumReplies)
}
/**

View File

@ -166,8 +166,8 @@ func create_stable_tweet() scraper.Tweet {
Images: []scraper.Image{
create_image_from_id(-1),
},
Mentions: []scraper.UserHandle{},
Hashtags: []string{},
Mentions: scraper.CommaSeparatedList{},
Hashtags: scraper.CommaSeparatedList{},
Polls: []scraper.Poll{
create_poll_from_id(-1),
},
@ -257,9 +257,9 @@ func create_dummy_tweet() scraper.Tweet {
Videos: []scraper.Video{vid},
Urls: []scraper.Url{url1, url2},
Images: []scraper.Image{img1, img2},
Mentions: []scraper.UserHandle{"mention1", "mention2"},
ReplyMentions: []scraper.UserHandle{"replymention1", "replymention2"},
Hashtags: []string{"hash1", "hash2"},
Mentions: scraper.CommaSeparatedList{"mention1", "mention2"},
ReplyMentions: scraper.CommaSeparatedList{"replymention1", "replymention2"},
Hashtags: scraper.CommaSeparatedList{"hash1", "hash2"},
Polls: []scraper.Poll{poll},
Spaces: []scraper.Space{space},
SpaceID: space_id,
@ -278,9 +278,9 @@ func create_dummy_tombstone() scraper.Tweet {
UserID: -1,
TombstoneType: "deleted",
IsStub: true,
Mentions: []scraper.UserHandle{},
ReplyMentions: []scraper.UserHandle{},
Hashtags: []string{},
Mentions: scraper.CommaSeparatedList{},
ReplyMentions: scraper.CommaSeparatedList{},
Hashtags: scraper.CommaSeparatedList{},
Spaces: []scraper.Space{},
}
}

View File

@ -125,7 +125,7 @@ func TestAPIV2ParseTweetWithQuotedTweet(t *testing.T) {
assert.Equal(TweetID(1485689207435710464), quoted_tweet.InReplyToID)
assert.Equal(TweetID(0), quoted_tweet.QuotedTweetID)
assert.Equal(1, len(quoted_tweet.ReplyMentions))
assert.Contains(quoted_tweet.ReplyMentions, UserHandle("michaelmalice"))
assert.Contains(quoted_tweet.ReplyMentions, "michaelmalice")
assert.Equal(1, quoted_tweet.NumReplies)
assert.Equal(12, quoted_tweet.NumLikes)

View File

@ -1,6 +1,7 @@
package scraper
import (
"database/sql/driver"
"fmt"
log "github.com/sirupsen/logrus"
"strings"
@ -9,44 +10,64 @@ import (
"offline_twitter/terminal_utils"
)
const DEFAULT_MAX_REPLIES_EAGER_LOAD = 50
type TweetID int64
// CommaSeparatedList is a list of strings that is stored in a single database
// column as one comma-joined string (e.g. "a,b,c").
//
// Items are joined and split on "," with no escaping, so an item that itself
// contains a comma will not survive a round trip through Value and Scan.
type CommaSeparatedList []string

// Scan implements the database/sql Scanner interface.
//
// It accepts a string or []byte holding comma-separated items, or nil (a SQL
// NULL), and replaces the contents of l with the non-empty items found. Empty
// items (from "", leading/trailing commas, or ",,") are dropped. On success l
// is always a non-nil list, possibly of length zero.
//
// Any other source type yields an error and leaves l untouched.
func (l *CommaSeparatedList) Scan(src interface{}) error {
	var raw string
	switch v := src.(type) {
	case string:
		raw = v
	case []byte:
		// Drivers are allowed to hand text columns back as raw bytes
		raw = string(v)
	case nil:
		// NULL column: treat it the same as an empty list
	default:
		// Report the problem to the caller rather than panicking inside the driver's scan path
		return fmt.Errorf("scanning CommaSeparatedList: unsupported source type %T", src)
	}

	*l = CommaSeparatedList{}
	for _, item := range strings.Split(raw, ",") {
		if item != "" {
			*l = append(*l, item)
		}
	}
	return nil
}

// Value implements the database/sql/driver Valuer interface. It returns the
// items joined with "," as a string; a nil or empty list becomes "".
func (l CommaSeparatedList) Value() (driver.Value, error) {
	return strings.Join(l, ","), nil
}
type Tweet struct {
ID TweetID
UserID UserID
ID TweetID `db:"id"`
UserID UserID `db:"user_id"`
UserHandle UserHandle // For processing tombstones
User *User
Text string
PostedAt Timestamp
NumLikes int
NumRetweets int
NumReplies int
NumQuoteTweets int
InReplyToID TweetID
QuotedTweetID TweetID
Text string `db:"text"`
IsExpandable bool `db:"is_expandable"`
PostedAt Timestamp `db:"posted_at"`
NumLikes int `db:"num_likes"`
NumRetweets int `db:"num_retweets"`
NumReplies int `db:"num_replies"`
NumQuoteTweets int `db:"num_quote_tweets"`
InReplyToID TweetID `db:"in_reply_to_id"`
QuotedTweetID TweetID `db:"quoted_tweet_id"`
Images []Image
Videos []Video
Mentions []UserHandle
ReplyMentions []UserHandle
Hashtags []string
Urls []Url
Polls []Poll
Mentions CommaSeparatedList `db:"mentions"`
ReplyMentions CommaSeparatedList `db:"reply_mentions"`
Hashtags CommaSeparatedList `db:"hashtags"`
// TODO get-rid-of-spaces: Might be good to get rid of `Spaces`. Only used in APIv1 I think.
// A first-step would be to delete the Spaces after pulling them out of a Tweet into the Trove
// in ParseTweetResponse. Then they will only be getting saved once rather than twice.
Spaces []Space
SpaceID SpaceID
SpaceID SpaceID `db:"space_id"`
TombstoneType string
IsStub bool
TombstoneType string `db:"tombstone_type"`
IsStub bool `db:"is_stub"`
IsContentDownloaded bool
IsConversationScraped bool
LastScrapedAt Timestamp
IsContentDownloaded bool `db:"is_content_downloaded"`
IsConversationScraped bool `db:"is_conversation_scraped"`
LastScrapedAt Timestamp `db:"last_scraped_at"`
}
func (t Tweet) String() string {
@ -150,14 +171,14 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
// Process `@` mentions and reply-mentions
for _, mention := range apiTweet.Entities.Mentions {
ret.Mentions = append(ret.Mentions, UserHandle(mention.UserName))
ret.Mentions = append(ret.Mentions, mention.UserName)
}
for _, mention := range strings.Split(apiTweet.Entities.ReplyMentions, " ") {
if mention != "" {
if mention[0] != '@' {
panic(fmt.Errorf("Unknown ReplyMention value %q:\n %w", apiTweet.Entities.ReplyMentions, EXTERNAL_API_ERROR))
}
ret.ReplyMentions = append(ret.ReplyMentions, UserHandle(mention[1:]))
ret.ReplyMentions = append(ret.ReplyMentions, mention[1:])
}
}

View File

@ -35,7 +35,7 @@ func TestParseSingleTweet(t *testing.T) {
assert.Equal("The fact that @michaelmalice new book The Anarchist Handbook is just absolutely destroying on the charts is the "+
"largest white pill Ive swallowed in years.", tweet.Text)
assert.Len(tweet.Mentions, 1)
assert.Contains(tweet.Mentions, UserHandle("michaelmalice"))
assert.Contains(tweet.Mentions, "michaelmalice")
assert.Empty(tweet.Urls)
assert.Equal(int64(1621639105), tweet.PostedAt.Unix())
assert.Zero(tweet.QuotedTweetID)
@ -179,7 +179,7 @@ func TestTweetWithLotsOfReplyMentions(t *testing.T) {
tweet := load_tweet_from_file("test_responses/single_tweets/tweet_with_at_mentions_in_front.json")
assert.Len(tweet.ReplyMentions, 4)
for i, v := range []UserHandle{"rob_mose", "primalpoly", "jmasseypoet", "SpaceX"} {
for i, v := range []string{"rob_mose", "primalpoly", "jmasseypoet", "SpaceX"} {
assert.Equal(v, tweet.ReplyMentions[i])
}
}