BUGFIX: don't clobber num_likes, num_retweets etc. if tweet is a stub
- Convert Tweet queries to sqlx, including a new CommaSeparatedList type that marshals Hashtags, Mentions and ReplyMentions to and from a single comma-separated column
parent dea37c7556
commit 815a8180da
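Not part of the diff — a minimal, hedged sketch of the guarded-upsert idea this commit applies in SaveTweet. It assumes github.com/jmoiron/sqlx with the mattn/go-sqlite3 driver, and uses a hypothetical trimmed-down tweets table (id, num_likes, is_stub) plus a made-up save helper; the real schema and query are in the hunks below.

package main

import (
    "fmt"
    "log"

    "github.com/jmoiron/sqlx"
    _ "github.com/mattn/go-sqlite3"
)

// Tweet is a cut-down stand-in for scraper.Tweet; the db tags drive sqlx's named parameters.
type Tweet struct {
    ID       int64 `db:"id"`
    NumLikes int   `db:"num_likes"`
    IsStub   bool  `db:"is_stub"`
}

// save upserts a tweet. When the incoming row is a stub (is_stub = true), the update
// keeps the existing engagement counts instead of clobbering them with zeros.
func save(db *sqlx.DB, t Tweet) error {
    _, err := db.NamedExec(`
        insert into tweets (id, num_likes, is_stub)
        values (:id, :num_likes, :is_stub)
        on conflict (id) do update
        set num_likes=(case when :is_stub then num_likes else :num_likes end),
            is_stub=(is_stub and :is_stub)`, t)
    return err
}

func main() {
    db := sqlx.MustConnect("sqlite3", ":memory:")
    db.MustExec(`create table tweets (id integer primary key, num_likes integer not null, is_stub boolean not null)`)

    if err := save(db, Tweet{ID: 1, NumLikes: 10}); err != nil {
        log.Fatal(err)
    }
    // Re-save the same tweet as a stub: num_likes must survive.
    if err := save(db, Tweet{ID: 1, NumLikes: 0, IsStub: true}); err != nil {
        log.Fatal(err)
    }

    var likes int
    if err := db.Get(&likes, `select num_likes from tweets where id = 1`); err != nil {
        log.Fatal(err)
    }
    fmt.Println("num_likes after stub re-save:", likes) // 10, not 0
}

The same case-when guard appears in the real query below for num_likes, num_retweets, num_replies and num_quote_tweets, and is_stub itself is only allowed to go from true to false.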
@@ -4,7 +4,6 @@ import (
     "database/sql"
     "errors"
     "fmt"
-    "strings"

     "offline_twitter/scraper"
 )
@@ -22,42 +21,36 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
         }
     }

-    _, err := db.Exec(`
+    _, err := db.NamedExec(`
         insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id,
                             quoted_tweet_id, mentions, reply_mentions, hashtags, space_id, tombstone_type, is_stub, is_content_downloaded,
                             is_conversation_scraped, last_scraped_at)
-        values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, nullif(?, ''), (select rowid from tombstone_types where short_name=?), ?, ?, ?, ?)
+        values (:id, :user_id, :text, :posted_at, :num_likes, :num_retweets, :num_replies, :num_quote_tweets, :in_reply_to_id, :quoted_tweet_id, :mentions, :reply_mentions, :hashtags, nullif(:space_id, ''), (select rowid from tombstone_types where short_name=:tombstone_type), :is_stub, :is_content_downloaded, :is_conversation_scraped, :last_scraped_at)
         on conflict do update
         set text=(case
             when is_stub then
-                ?
+                :text
             else
                 text
             end
         ),
-        num_likes=?,
-        num_retweets=?,
-        num_replies=?,
-        num_quote_tweets=?,
-        is_stub=(is_stub and ?),
+        num_likes=(case when :is_stub then num_likes else :num_likes end),
+        num_retweets=(case when :is_stub then num_retweets else :num_retweets end),
+        num_replies=(case when :is_stub then num_replies else :num_replies end),
+        num_quote_tweets=(case when :is_stub then num_quote_tweets else :num_quote_tweets end),
+        is_stub=(is_stub and :is_stub),
         tombstone_type=(case
-            when ?='unavailable' and tombstone_type not in (0, 4) then
+            when :tombstone_type='unavailable' and tombstone_type not in (0, 4) then
                 tombstone_type
             else
-                (select rowid from tombstone_types where short_name=?)
+                (select rowid from tombstone_types where short_name=:tombstone_type)
             end
         ),
-        is_content_downloaded=(is_content_downloaded or ?),
-        is_conversation_scraped=(is_conversation_scraped or ?),
-        last_scraped_at=max(last_scraped_at, ?)
+        is_content_downloaded=(is_content_downloaded or :is_content_downloaded),
+        is_conversation_scraped=(is_conversation_scraped or :is_conversation_scraped),
+        last_scraped_at=max(last_scraped_at, :last_scraped_at)
         `,
-        t.ID, t.UserID, t.Text, t.PostedAt, t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyToID,
-        t.QuotedTweetID, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions),
-        strings.Join(t.Hashtags, ","), t.SpaceID, t.TombstoneType, t.IsStub, t.IsContentDownloaded, t.IsConversationScraped,
-        t.LastScrapedAt,
-
-        t.Text, t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsStub, t.TombstoneType, t.TombstoneType,
-        t.IsContentDownloaded, t.IsConversationScraped, t.LastScrapedAt,
+        t,
     )

     if err != nil {
@@ -119,82 +112,53 @@ func (p Profile) IsTweetInDatabase(id scraper.TweetID) bool {
 func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
     db := p.DB

-    stmt, err := db.Prepare(`
+    var t scraper.Tweet
+    err := db.Get(&t, `
         select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, quoted_tweet_id,
-               mentions, reply_mentions, hashtags, ifnull(space_id, ''), ifnull(tombstone_types.short_name, ""), is_stub,
+               mentions, reply_mentions, hashtags, ifnull(space_id, '') space_id, ifnull(tombstone_types.short_name, "") tombstone_type, is_stub,
                is_content_downloaded, is_conversation_scraped, last_scraped_at
         from tweets left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
         where id = ?
-    `)
+    `, id)

     if err != nil {
-        return scraper.Tweet{}, fmt.Errorf("Error preparing statement in GetTweetByID(%d):\n %w", id, err)
-    }
-    defer stmt.Close()
-
-    var t scraper.Tweet
-    var mentions string
-    var reply_mentions string
-    var hashtags string
-
-    row := stmt.QueryRow(id)
-    err = row.Scan(&t.ID, &t.UserID, &t.Text, &t.PostedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyToID,
-        &t.QuotedTweetID, &mentions, &reply_mentions, &hashtags, &t.SpaceID, &t.TombstoneType, &t.IsStub, &t.IsContentDownloaded,
-        &t.IsConversationScraped, &t.LastScrapedAt)
-    if err != nil {
-        return t, fmt.Errorf("Error parsing result in GetTweetByID(%d):\n %w", id, err)
-    }
-
-    t.Mentions = []scraper.UserHandle{}
-    for _, m := range strings.Split(mentions, ",") {
-        if m != "" {
-            t.Mentions = append(t.Mentions, scraper.UserHandle(m))
-        }
-    }
-    t.ReplyMentions = []scraper.UserHandle{}
-    for _, m := range strings.Split(reply_mentions, ",") {
-        if m != "" {
-            t.ReplyMentions = append(t.ReplyMentions, scraper.UserHandle(m))
-        }
-    }
-    t.Hashtags = []string{}
-    for _, h := range strings.Split(hashtags, ",") {
-        if h != "" {
-            t.Hashtags = append(t.Hashtags, h)
-        }
+        return scraper.Tweet{}, fmt.Errorf("Error executing GetTweetByID(%d):\n %w", id, err)
     }

     t.Spaces = []scraper.Space{}
     if t.SpaceID != "" {
         space, err := p.GetSpaceById(t.SpaceID)
         if err != nil {
-            return t, err
+            return t, fmt.Errorf("Error retrieving space with ID %s (tweet %d):\n %w", t.SpaceID, t.ID, err)
         }
         t.Spaces = append(t.Spaces, space)
     }

     imgs, err := p.GetImagesForTweet(t)
     if err != nil {
-        return t, err
+        return t, fmt.Errorf("Error retrieving images for tweet %d:\n %w", t.ID, err)
     }
     t.Images = imgs

     vids, err := p.GetVideosForTweet(t)
     if err != nil {
-        return t, err
+        return t, fmt.Errorf("Error retrieving videos for tweet %d:\n %w", t.ID, err)
     }
     t.Videos = vids

     polls, err := p.GetPollsForTweet(t)
     if err != nil {
-        return t, err
+        return t, fmt.Errorf("Error retrieving polls for tweet %d:\n %w", t.ID, err)
     }
     t.Polls = polls

     urls, err := p.GetUrlsForTweet(t)
     if err != nil {
         return t, fmt.Errorf("Error retrieving urls for tweet %d:\n %w", t.ID, err)
     }
     t.Urls = urls

-    return t, err
+    return t, nil
 }

 /**
@@ -77,6 +77,10 @@ func TestNoWorseningTweet(t *testing.T) {
     tweet.IsConversationScraped = true
     tweet.LastScrapedAt = scraper.TimestampFromUnix(1000)
     tweet.Text = "Yes text"
+    tweet.NumLikes = 10
+    tweet.NumRetweets = 11
+    tweet.NumQuoteTweets = 12
+    tweet.NumReplies = 13

     // Save the tweet
     err := profile.SaveTweet(tweet)
@@ -90,6 +94,10 @@ func TestNoWorseningTweet(t *testing.T) {
     tweet.Text = ""
     err = profile.SaveTweet(tweet)
     require.NoError(err)
+    tweet.NumLikes = 0
+    tweet.NumRetweets = 0
+    tweet.NumQuoteTweets = 0
+    tweet.NumReplies = 0

     // Reload the tweet
     new_tweet, err := profile.GetTweetById(tweet.ID)
@@ -100,6 +108,10 @@ func TestNoWorseningTweet(t *testing.T) {
     assert.True(new_tweet.IsConversationScraped, "Should have preserved is-conversation-scraped status")
     assert.Equal(int64(1000), new_tweet.LastScrapedAt.Unix(), "Should have preserved last-scraped-at time")
     assert.Equal(new_tweet.Text, "Yes text", "Text should not get clobbered if it becomes unavailable")
+    assert.Equal(10, new_tweet.NumLikes)
+    assert.Equal(11, new_tweet.NumRetweets)
+    assert.Equal(12, new_tweet.NumQuoteTweets)
+    assert.Equal(13, new_tweet.NumReplies)
 }

 /**
@@ -166,8 +166,8 @@ func create_stable_tweet() scraper.Tweet {
         Images: []scraper.Image{
             create_image_from_id(-1),
         },
-        Mentions: []scraper.UserHandle{},
-        Hashtags: []string{},
+        Mentions: scraper.CommaSeparatedList{},
+        Hashtags: scraper.CommaSeparatedList{},
         Polls: []scraper.Poll{
             create_poll_from_id(-1),
         },
@@ -257,9 +257,9 @@ func create_dummy_tweet() scraper.Tweet {
         Videos: []scraper.Video{vid},
         Urls: []scraper.Url{url1, url2},
         Images: []scraper.Image{img1, img2},
-        Mentions: []scraper.UserHandle{"mention1", "mention2"},
-        ReplyMentions: []scraper.UserHandle{"replymention1", "replymention2"},
-        Hashtags: []string{"hash1", "hash2"},
+        Mentions: scraper.CommaSeparatedList{"mention1", "mention2"},
+        ReplyMentions: scraper.CommaSeparatedList{"replymention1", "replymention2"},
+        Hashtags: scraper.CommaSeparatedList{"hash1", "hash2"},
         Polls: []scraper.Poll{poll},
         Spaces: []scraper.Space{space},
         SpaceID: space_id,
@@ -278,9 +278,9 @@ func create_dummy_tombstone() scraper.Tweet {
         UserID: -1,
         TombstoneType: "deleted",
         IsStub: true,
-        Mentions: []scraper.UserHandle{},
-        ReplyMentions: []scraper.UserHandle{},
-        Hashtags: []string{},
+        Mentions: scraper.CommaSeparatedList{},
+        ReplyMentions: scraper.CommaSeparatedList{},
+        Hashtags: scraper.CommaSeparatedList{},
         Spaces: []scraper.Space{},
     }
 }
@@ -125,7 +125,7 @@ func TestAPIV2ParseTweetWithQuotedTweet(t *testing.T) {
     assert.Equal(TweetID(1485689207435710464), quoted_tweet.InReplyToID)
     assert.Equal(TweetID(0), quoted_tweet.QuotedTweetID)
     assert.Equal(1, len(quoted_tweet.ReplyMentions))
-    assert.Contains(quoted_tweet.ReplyMentions, UserHandle("michaelmalice"))
+    assert.Contains(quoted_tweet.ReplyMentions, "michaelmalice")
     assert.Equal(1, quoted_tweet.NumReplies)
     assert.Equal(12, quoted_tweet.NumLikes)

@@ -1,6 +1,7 @@
 package scraper

 import (
+    "database/sql/driver"
     "fmt"
     log "github.com/sirupsen/logrus"
     "strings"
@@ -9,44 +10,64 @@ import (
     "offline_twitter/terminal_utils"
 )

 const DEFAULT_MAX_REPLIES_EAGER_LOAD = 50

 type TweetID int64

+type CommaSeparatedList []string
+
+func (l *CommaSeparatedList) Scan(src interface{}) error {
+    *l = CommaSeparatedList{}
+    switch src.(type) {
+    case string:
+        for _, v := range strings.Split(src.(string), ",") {
+            if v != "" {
+                *l = append(*l, v)
+            }
+        }
+    default:
+        panic("Should be a string")
+    }
+    return nil
+}
+func (l CommaSeparatedList) Value() (driver.Value, error) {
+    return strings.Join(l, ","), nil
+}
+
 type Tweet struct {
-    ID     TweetID
-    UserID UserID
+    ID     TweetID `db:"id"`
+    UserID UserID  `db:"user_id"`
     UserHandle UserHandle // For processing tombstones
     User       *User
-    Text           string
-    PostedAt       Timestamp
-    NumLikes       int
-    NumRetweets    int
-    NumReplies     int
-    NumQuoteTweets int
-    InReplyToID    TweetID
-    QuotedTweetID  TweetID
+    Text           string    `db:"text"`
+    IsExpandable   bool      `db:"is_expandable"`
+    PostedAt       Timestamp `db:"posted_at"`
+    NumLikes       int       `db:"num_likes"`
+    NumRetweets    int       `db:"num_retweets"`
+    NumReplies     int       `db:"num_replies"`
+    NumQuoteTweets int       `db:"num_quote_tweets"`
+    InReplyToID    TweetID   `db:"in_reply_to_id"`
+    QuotedTweetID  TweetID   `db:"quoted_tweet_id"`

     Images        []Image
     Videos        []Video
-    Mentions      []UserHandle
-    ReplyMentions []UserHandle
-    Hashtags      []string
     Urls          []Url
     Polls         []Poll
+    Mentions      CommaSeparatedList `db:"mentions"`
+    ReplyMentions CommaSeparatedList `db:"reply_mentions"`
+    Hashtags      CommaSeparatedList `db:"hashtags"`

     // TODO get-rid-of-spaces: Might be good to get rid of `Spaces`. Only used in APIv1 I think.
     // A first-step would be to delete the Spaces after pulling them out of a Tweet into the Trove
     // in ParseTweetResponse. Then they will only be getting saved once rather than twice.
     Spaces  []Space
-    SpaceID SpaceID
+    SpaceID SpaceID `db:"space_id"`

-    TombstoneType string
-    IsStub        bool
+    TombstoneType string `db:"tombstone_type"`
+    IsStub        bool   `db:"is_stub"`

-    IsContentDownloaded   bool
-    IsConversationScraped bool
-    LastScrapedAt         Timestamp
+    IsContentDownloaded   bool      `db:"is_content_downloaded"`
+    IsConversationScraped bool      `db:"is_conversation_scraped"`
+    LastScrapedAt         Timestamp `db:"last_scraped_at"`
 }

 func (t Tweet) String() string {
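Aside (not part of the diff): how the new CommaSeparatedList behaves at the database boundary. Value() is what the driver calls when the column is bound on insert, and Scan() is what fills the field when a row is read back into a Tweet. The type below is copied from the hunk above; only the small demo around it is new, and the column name mentioned in the comments is just the one used in the queries above.

package main

import (
    "database/sql/driver"
    "fmt"
    "strings"
)

// Copied from the hunk above: a []string stored in one TEXT column as "a,b,c".
type CommaSeparatedList []string

func (l *CommaSeparatedList) Scan(src interface{}) error {
    *l = CommaSeparatedList{}
    switch src.(type) {
    case string:
        for _, v := range strings.Split(src.(string), ",") {
            if v != "" {
                *l = append(*l, v)
            }
        }
    default:
        panic("Should be a string")
    }
    return nil
}

func (l CommaSeparatedList) Value() (driver.Value, error) {
    return strings.Join(l, ","), nil
}

func main() {
    in := CommaSeparatedList{"mention1", "mention2"}
    v, _ := in.Value() // "mention1,mention2" — exactly what lands in the tweets.mentions column

    var out CommaSeparatedList
    _ = out.Scan(v) // round-trips back to [mention1 mention2]; an empty column yields an empty list, not [""]
    fmt.Printf("%q -> %v\n", v, out)
}

Because the three list fields now implement Value() and Scan(), SaveTweet can hand the whole Tweet struct to NamedExec and GetTweetById can db.Get straight into it, with no JoinArrayOfHandles / strings.Split plumbing left in the persistence layer.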
@@ -150,14 +171,14 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {

     // Process `@` mentions and reply-mentions
     for _, mention := range apiTweet.Entities.Mentions {
-        ret.Mentions = append(ret.Mentions, UserHandle(mention.UserName))
+        ret.Mentions = append(ret.Mentions, mention.UserName)
     }
     for _, mention := range strings.Split(apiTweet.Entities.ReplyMentions, " ") {
         if mention != "" {
             if mention[0] != '@' {
                 panic(fmt.Errorf("Unknown ReplyMention value %q:\n %w", apiTweet.Entities.ReplyMentions, EXTERNAL_API_ERROR))
             }
-            ret.ReplyMentions = append(ret.ReplyMentions, UserHandle(mention[1:]))
+            ret.ReplyMentions = append(ret.ReplyMentions, mention[1:])
         }
     }

@@ -35,7 +35,7 @@ func TestParseSingleTweet(t *testing.T) {
     assert.Equal("The fact that @michaelmalice new book ‘The Anarchist Handbook’ is just absolutely destroying on the charts is the "+
         "largest white pill I’ve swallowed in years.", tweet.Text)
     assert.Len(tweet.Mentions, 1)
-    assert.Contains(tweet.Mentions, UserHandle("michaelmalice"))
+    assert.Contains(tweet.Mentions, "michaelmalice")
     assert.Empty(tweet.Urls)
     assert.Equal(int64(1621639105), tweet.PostedAt.Unix())
     assert.Zero(tweet.QuotedTweetID)
@@ -179,7 +179,7 @@ func TestTweetWithLotsOfReplyMentions(t *testing.T) {
     tweet := load_tweet_from_file("test_responses/single_tweets/tweet_with_at_mentions_in_front.json")
     assert.Len(tweet.ReplyMentions, 4)

-    for i, v := range []UserHandle{"rob_mose", "primalpoly", "jmasseypoet", "SpaceX"} {
+    for i, v := range []string{"rob_mose", "primalpoly", "jmasseypoet", "SpaceX"} {
         assert.Equal(v, tweet.ReplyMentions[i])
     }
 }