Persist tombstone data
This commit is contained in:
parent
e5b4b43358
commit
470dce1d27
@ -144,7 +144,7 @@ urls_count_after=$(sqlite3 twitter.db "select count(*) from urls")
|
||||
test $urls_count_after = $(($urls_count + 1))
|
||||
test "$(sqlite3 twitter.db "select title from urls where tweet_id = 1428904664645394433")" = "Justice Department investigating Elon Musk's SpaceX following complaint of hiring discrimination"
|
||||
test $(sqlite3 twitter.db "select count(*) from urls where tweet_id = 1428904664645394433") = "1"
|
||||
thumbnail_name=$(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433" | grep -Po "(?<=/)\w+(?=\?)")
|
||||
thumbnail_name=$(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433" | grep -Po "(?<=/)[\w-]+(?=\?)")
|
||||
test -n "$thumbnail_name" # Not testing for what the thumbnail url is because it keeps changing
|
||||
|
||||
# Try to double-fetch it; shouldn't duplicate the URL
|
||||
@ -174,6 +174,12 @@ test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id
|
||||
test $(find link_preview_images | wc -l) = $initial_link_preview_images_count # Should be the same
|
||||
|
||||
|
||||
# Test a tweet thread with tombstones
|
||||
tw fetch_tweet https://twitter.com/CovfefeAnon/status/1454526270809726977
|
||||
test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454515503242829830") = 1
|
||||
test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454521424144654344") = 0
|
||||
test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454522147750260742") = 1
|
||||
|
||||
|
||||
# TODO: Maybe this file should be broken up into multiple test scripts
|
||||
|
||||
|
@ -19,10 +19,18 @@ create table users (rowid integer primary key,
|
||||
pinned_tweet_id integer check(typeof(pinned_tweet_id) = 'integer' or pinned_tweet_id = ''),
|
||||
|
||||
is_content_downloaded boolean default 0
|
||||
|
||||
-- foreign key(pinned_tweet_id) references tweets(id)
|
||||
);
|
||||
|
||||
-- Lookup table of the tombstone kinds that can stand in for a tweet.
--   short_name      short identifier; the Go code looks rows up by this value
--   tombstone_text  the placeholder message belonging to that kind
-- Both columns are mandatory and unique.  tweets.tombstone_type is joined
-- against this table's rowid.
create table tombstone_types (
    rowid integer primary key,
    short_name text not null unique,
    tombstone_text text not null unique
);

-- Seed the known tombstone kinds with explicit rowids.
insert into tombstone_types (rowid, short_name, tombstone_text)
values
    (1, 'deleted', 'This Tweet was deleted by the Tweet author'),
    -- NOTE(review): '???' looks like a placeholder for the suspended-account message -- confirm and fill in.
    (2, 'suspended', '???'),
    (3, 'hidden', 'You’re unable to view this Tweet because this account owner limits who can view their Tweets'),
    (4, 'unavailable', 'This Tweet is unavailable');
|
||||
|
||||
create table tweets (rowid integer primary key,
|
||||
id integer unique not null check(typeof(id) = 'integer'),
|
||||
user_id integer not null check(typeof(user_id) = 'integer'),
|
||||
@ -37,11 +45,11 @@ create table tweets (rowid integer primary key,
|
||||
mentions text, -- comma-separated
|
||||
reply_mentions text, -- comma-separated
|
||||
hashtags text, -- comma-separated
|
||||
tombstone_type integer default 0,
|
||||
is_stub boolean default 0,
|
||||
|
||||
is_content_downloaded boolean default 0,
|
||||
foreign key(user_id) references users(id)
|
||||
-- foreign key(in_reply_to) references tweets(id),
|
||||
-- foreign key(quoted_tweet) references tweets(id)
|
||||
);
|
||||
|
||||
create table retweets(rowid integer primary key,
|
||||
@ -71,8 +79,6 @@ create table urls (rowid integer primary key,
|
||||
|
||||
unique (tweet_id, text)
|
||||
foreign key(tweet_id) references tweets(id)
|
||||
-- foreign key(creator_id) references users(id)
|
||||
-- foreign key(site_id) references users(id)
|
||||
);
|
||||
|
||||
create table images (rowid integer primary key,
|
||||
|
@ -16,17 +16,18 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
|
||||
return err
|
||||
}
|
||||
_, err = db.Exec(`
|
||||
insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, is_content_downloaded)
|
||||
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, tombstone_type, is_stub, is_content_downloaded)
|
||||
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, (select rowid from tombstone_types where short_name=?), ?, ?)
|
||||
on conflict do update
|
||||
set num_likes=?,
|
||||
num_retweets=?,
|
||||
num_replies=?,
|
||||
num_quote_tweets=?,
|
||||
is_stub=(is_stub and ?),
|
||||
is_content_downloaded=(is_content_downloaded or ?)
|
||||
`,
|
||||
t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions), strings.Join(t.Hashtags, ","), t.IsContentDownloaded,
|
||||
t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsContentDownloaded,
|
||||
t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions), strings.Join(t.Hashtags, ","), t.TombstoneType, t.IsStub, t.IsContentDownloaded,
|
||||
t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsStub, t.IsContentDownloaded,
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
@ -83,8 +84,8 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
|
||||
db := p.DB
|
||||
|
||||
stmt, err := db.Prepare(`
|
||||
select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, is_content_downloaded
|
||||
from tweets
|
||||
select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, ifnull(tombstone_types.short_name, ""), is_stub, is_content_downloaded
|
||||
from tweets left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
|
||||
where id = ?
|
||||
`)
|
||||
|
||||
@ -100,19 +101,30 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
|
||||
var hashtags string
|
||||
|
||||
row := stmt.QueryRow(id)
|
||||
err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyTo, &t.QuotedTweet, &mentions, &reply_mentions, &hashtags, &t.IsContentDownloaded)
|
||||
err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyTo, &t.QuotedTweet, &mentions, &reply_mentions, &hashtags, &t.TombstoneType, &t.IsStub, &t.IsContentDownloaded)
|
||||
if err != nil {
|
||||
return t, err
|
||||
}
|
||||
|
||||
t.PostedAt = time.Unix(int64(postedAt), 0) // args are `seconds` and `nanoseconds`
|
||||
t.Mentions = []scraper.UserHandle{}
|
||||
for _, m := range strings.Split(mentions, ",") {
|
||||
t.Mentions = append(t.Mentions, scraper.UserHandle(m))
|
||||
if m != "" {
|
||||
t.Mentions = append(t.Mentions, scraper.UserHandle(m))
|
||||
}
|
||||
}
|
||||
t.ReplyMentions = []scraper.UserHandle{}
|
||||
for _, m := range strings.Split(reply_mentions, ",") {
|
||||
t.ReplyMentions = append(t.ReplyMentions, scraper.UserHandle(m))
|
||||
if m != "" {
|
||||
t.ReplyMentions = append(t.ReplyMentions, scraper.UserHandle(m))
|
||||
}
|
||||
}
|
||||
t.Hashtags = []string{}
|
||||
for _, h := range strings.Split(hashtags, ",") {
|
||||
if h != "" {
|
||||
t.Hashtags = append(t.Hashtags, h)
|
||||
}
|
||||
}
|
||||
t.Hashtags = strings.Split(hashtags, ",")
|
||||
|
||||
imgs, err := p.GetImagesForTweet(t)
|
||||
if err != nil {
|
||||
|
@ -34,6 +34,76 @@ func TestSaveAndLoadTweet(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Same as above, but with a tombstone
|
||||
*/
|
||||
func TestSaveAndLoadTombstone(t *testing.T) {
|
||||
profile_path := "test_profiles/TestTweetQueries"
|
||||
profile := create_or_load_profile(profile_path)
|
||||
|
||||
tweet := create_dummy_tombstone()
|
||||
|
||||
// Save the tweet
|
||||
err := profile.SaveTweet(tweet)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to save the tweet: %s", err.Error())
|
||||
}
|
||||
|
||||
// Reload the tweet
|
||||
new_tweet, err := profile.GetTweetById(tweet.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to load the tweet: %s", err.Error())
|
||||
}
|
||||
|
||||
if diff := deep.Equal(tweet, new_tweet); diff != nil {
|
||||
t.Error(diff)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Saving a tweet that already exists shouldn't reduce its backed-up status.
|
||||
* i.e., content which is already saved shouldn't be marked un-saved if it's removed from Twitter.
|
||||
* After all, that's the whole point of archiving.
|
||||
*
|
||||
* - is_stub should only go from "yes" to "no"
|
||||
* - is_content_downloaded should only go from "no" to "yes"
|
||||
*/
|
||||
func TestNoWorseningTweet(t *testing.T) {
|
||||
profile_path := "test_profiles/TestTweetQueries"
|
||||
profile := create_or_load_profile(profile_path)
|
||||
|
||||
tweet := create_dummy_tweet()
|
||||
tweet.IsContentDownloaded = true
|
||||
tweet.IsStub = false
|
||||
|
||||
// Save the tweet
|
||||
err := profile.SaveTweet(tweet)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to save the tweet: %s", err.Error())
|
||||
}
|
||||
|
||||
// Worsen the tweet and re-save it
|
||||
tweet.IsContentDownloaded = false
|
||||
tweet.IsStub = true
|
||||
err = profile.SaveTweet(tweet)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to save the tweet: %s", err.Error())
|
||||
}
|
||||
|
||||
// Reload the tweet
|
||||
new_tweet, err := profile.GetTweetById(tweet.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to load the tweet: %s", err.Error())
|
||||
}
|
||||
|
||||
if new_tweet.IsStub != false {
|
||||
t.Errorf("Should have preserved non-stub status")
|
||||
}
|
||||
if new_tweet.IsContentDownloaded != true {
|
||||
t.Errorf("Should have preserved is-content-downloaded status")
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Should correctly report whether the User exists in the database
|
||||
*/
|
||||
|
@ -225,6 +225,24 @@ func create_dummy_tweet() scraper.Tweet {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a random tombstone
|
||||
*/
|
||||
func create_dummy_tombstone() scraper.Tweet {
|
||||
rand.Seed(time.Now().UnixNano())
|
||||
tweet_id := scraper.TweetID(rand.Int())
|
||||
|
||||
return scraper.Tweet{
|
||||
ID: tweet_id,
|
||||
UserID: -1,
|
||||
TombstoneType: "deleted",
|
||||
IsStub: true,
|
||||
Mentions: []scraper.UserHandle{},
|
||||
ReplyMentions: []scraper.UserHandle{},
|
||||
Hashtags: []string{},
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new retweet with a random ID for a given TweetID
|
||||
*/
|
||||
|
@ -32,6 +32,9 @@ type Tweet struct {
|
||||
Hashtags []string
|
||||
QuotedTweet TweetID
|
||||
|
||||
TombstoneType string
|
||||
IsStub bool
|
||||
|
||||
IsContentDownloaded bool
|
||||
}
|
||||
|
||||
@ -81,10 +84,13 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
||||
ret.UserID = UserID(apiTweet.UserID)
|
||||
ret.Text = apiTweet.FullText
|
||||
|
||||
ret.PostedAt, err = time.Parse(time.RubyDate, apiTweet.CreatedAt)
|
||||
if err != nil {
|
||||
return
|
||||
if apiTweet.TombstoneText == "" { // Skip time parsing for tombstones
|
||||
ret.PostedAt, err = time.Parse(time.RubyDate, apiTweet.CreatedAt)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
ret.NumLikes = apiTweet.FavoriteCount
|
||||
ret.NumRetweets = apiTweet.RetweetCount
|
||||
ret.NumReplies = apiTweet.ReplyCount
|
||||
@ -139,6 +145,10 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
||||
ret.Videos = []Video{new_video}
|
||||
ret.Images = []Image{}
|
||||
}
|
||||
|
||||
ret.TombstoneType = apiTweet.TombstoneText
|
||||
ret.IsStub = !(ret.TombstoneType == "")
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
@ -190,8 +200,20 @@ func GetTweetFull(id TweetID) (tweets []Tweet, retweets []Retweet, users []User,
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
return ParseTweetResponse(tweet_response)
|
||||
// HandleTombstones returns the handles of extra users that have to be fetched
// separately (presumably the authors of tombstoned tweets -- the method is
// defined elsewhere, TODO confirm).  Fetch each one and add it to the result.
tombstone_users := tweet_response.HandleTombstones()
fmt.Printf("%v\n", tombstone_users)
for _, u := range tombstone_users {
	fetched_user, err1 := GetUser(UserHandle(u))
	if err1 != nil {
		// Test the error from this GetUser call, not the outer `err`;
		// otherwise a failed fetch is ignored and an empty user is appended.
		// The error reaches the caller through the named return value.
		err = err1
		return
	}
	fmt.Println(fetched_user)
	users = append(users, fetched_user)
}

// Parse the response itself, then merge its users with the ones fetched above
tweets, retweets, _users, err := ParseTweetResponse(tweet_response)
users = append(users, _users...)
return
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -209,9 +209,40 @@ func TestParseTweetResponse(t *testing.T) {
|
||||
t.Errorf("Expected %d tweets, got %d", 29-3, len(tweets))
|
||||
}
|
||||
if len(retweets) != 3 {
|
||||
t.Errorf("Expected %d tweets, got %d", 3, len(retweets))
|
||||
t.Errorf("Expected %d retweets, got %d", 3, len(retweets))
|
||||
}
|
||||
if len(users) != 9 {
|
||||
t.Errorf("Expected %d tweets, got %d", 9, len(users))
|
||||
t.Errorf("Expected %d users, got %d", 9, len(users))
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseTweetResponseWithTombstones(t *testing.T) {
|
||||
data, err := ioutil.ReadFile("test_responses/tombstones/tombstone_deleted.json")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
var tweet_resp scraper.TweetResponse
|
||||
err = json.Unmarshal(data, &tweet_resp)
|
||||
if err != nil {
|
||||
t.Errorf(err.Error())
|
||||
}
|
||||
extra_users := tweet_resp.HandleTombstones()
|
||||
if len(extra_users) != 1 {
|
||||
t.Errorf("Expected to need 1 extra user but got %d instead", len(extra_users))
|
||||
}
|
||||
|
||||
tweets, retweets, users, err := scraper.ParseTweetResponse(tweet_resp)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if len(tweets) != 2 {
|
||||
t.Errorf("Expected %d tweets, got %d", 2, len(tweets))
|
||||
}
|
||||
if len(retweets) != 0 {
|
||||
t.Errorf("Expected %d retweets, got %d", 0, len(retweets))
|
||||
}
|
||||
if len(users) != 1 {
|
||||
t.Errorf("Expected %d users, got %d", 1, len(users))
|
||||
}
|
||||
}
|
||||
|
@ -18,8 +18,7 @@ func GetUserFeedFor(user_id UserID, min_tweets int) (tweets []Tweet, retweets []
|
||||
return
|
||||
}
|
||||
|
||||
if len(tweet_response.GlobalObjects.Tweets) < min_tweets &&
|
||||
tweet_response.GetCursor() != "" {
|
||||
if len(tweet_response.GlobalObjects.Tweets) < min_tweets && tweet_response.GetCursor() != "" {
|
||||
err = api.GetMoreTweetsFromFeed(user_id, &tweet_response, min_tweets)
|
||||
if err != nil && err != END_OF_FEED {
|
||||
return
|
||||
|
Loading…
x
Reference in New Issue
Block a user