Persist tombstone data

Alessio 2021-11-06 13:37:46 -07:00
parent e5b4b43358
commit 470dce1d27
8 changed files with 190 additions and 26 deletions

View File

@@ -144,7 +144,7 @@ urls_count_after=$(sqlite3 twitter.db "select count(*) from urls")
 test $urls_count_after = $(($urls_count + 1))
 test "$(sqlite3 twitter.db "select title from urls where tweet_id = 1428904664645394433")" = "Justice Department investigating Elon Musk's SpaceX following complaint of hiring discrimination"
 test $(sqlite3 twitter.db "select count(*) from urls where tweet_id = 1428904664645394433") = "1"
-thumbnail_name=$(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433" | grep -Po "(?<=/)\w+(?=\?)")
+thumbnail_name=$(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433" | grep -Po "(?<=/)[\w-]+(?=\?)")
 test -n "$thumbnail_name" # Not testing for what the thumbnail url is because it keeps changing

 # Try to double-fetch it; shouldn't duplicate the URL
@@ -174,6 +174,12 @@ test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id
 test $(find link_preview_images | wc -l) = $initial_link_preview_images_count # Should be the same
+
+# Test a tweet thread with tombstones
+tw fetch_tweet https://twitter.com/CovfefeAnon/status/1454526270809726977
+test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454515503242829830") = 1
+test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454521424144654344") = 0
+test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454522147750260742") = 1
 # TODO: Maybe this file should be broken up into multiple test scripts

View File

@@ -19,10 +19,18 @@ create table users (rowid integer primary key,
     pinned_tweet_id integer check(typeof(pinned_tweet_id) = 'integer' or pinned_tweet_id = ''),
     is_content_downloaded boolean default 0
-    -- foreign key(pinned_tweet_id) references tweets(id)
 );

+create table tombstone_types (rowid integer primary key,
+    short_name text not null unique,
+    tombstone_text text not null unique
+);
+insert into tombstone_types(rowid, short_name, tombstone_text) values
+    (1, 'deleted', 'This Tweet was deleted by the Tweet author'),
+    (2, 'suspended', '???'),
+    (3, 'hidden', 'You’re unable to view this Tweet because this account owner limits who can view their Tweets'),
+    (4, 'unavailable', 'This Tweet is unavailable');
+
 create table tweets (rowid integer primary key,
     id integer unique not null check(typeof(id) = 'integer'),
     user_id integer not null check(typeof(user_id) = 'integer'),
@@ -37,11 +45,11 @@ create table tweets (rowid integer primary key,
     mentions text, -- comma-separated
     reply_mentions text, -- comma-separated
     hashtags text, -- comma-separated
+    tombstone_type integer default 0,
+    is_stub boolean default 0,
     is_content_downloaded boolean default 0,
     foreign key(user_id) references users(id)
-    -- foreign key(in_reply_to) references tweets(id),
-    -- foreign key(quoted_tweet) references tweets(id)
 );

 create table retweets(rowid integer primary key,
@@ -71,8 +79,6 @@ create table urls (rowid integer primary key,
     unique (tweet_id, text),
     foreign key(tweet_id) references tweets(id)
-    -- foreign key(creator_id) references users(id)
-    -- foreign key(site_id) references users(id)
 );

 create table images (rowid integer primary key,
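The tombstone short name is never stored directly: SaveTweet (below) resolves it to a tombstone_types rowid with a subquery, and GetTweetById resolves the rowid back with a left join, so a tweet whose tombstone_type is the default 0 matches no lookup row and ifnull() collapses the NULL back to "". A minimal standalone sketch of that round trip, assuming the schema above is already applied to twitter.db via the mattn/go-sqlite3 driver (the file name, database path, and elided tweet columns are illustrative only):

// tombstone_roundtrip.go — an illustrative sketch, not part of this commit.
package main

import (
    "database/sql"
    "fmt"
    "log"

    _ "github.com/mattn/go-sqlite3"
)

func main() {
    db, err := sql.Open("sqlite3", "twitter.db") // assumes the schema above is loaded
    if err != nil {
        log.Fatal(err)
    }
    defer db.Close()

    // Write: map the short name to its lookup-table rowid, as SaveTweet does.
    _, err = db.Exec(`insert into tweets (id, user_id, tombstone_type, is_stub)
        values (?, ?, (select rowid from tombstone_types where short_name = ?), ?)`,
        1454515503242829830, -1, "hidden", 1)
    if err != nil {
        log.Fatal(err)
    }

    // Read: resolve the rowid back to the short name, as GetTweetById does.
    var shortName string
    err = db.QueryRow(`select ifnull(tombstone_types.short_name, '')
        from tweets left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
        where tweets.id = ?`, 1454515503242829830).Scan(&shortName)
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(shortName) // prints "hidden"
}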

View File

@@ -16,17 +16,18 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
         return err
     }
     _, err = db.Exec(`
-        insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, is_content_downloaded)
-        values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, tombstone_type, is_stub, is_content_downloaded)
+        values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, (select rowid from tombstone_types where short_name=?), ?, ?)
         on conflict do update
         set num_likes=?,
             num_retweets=?,
             num_replies=?,
             num_quote_tweets=?,
+            is_stub=(is_stub and ?),
             is_content_downloaded=(is_content_downloaded or ?)
         `,
-        t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions), strings.Join(t.Hashtags, ","), t.IsContentDownloaded,
-        t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsContentDownloaded,
+        t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions), strings.Join(t.Hashtags, ","), t.TombstoneType, t.IsStub, t.IsContentDownloaded,
+        t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsStub, t.IsContentDownloaded,
     )
     if err != nil {
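The conflict clause above is what makes re-saving safe: `is_stub=(is_stub and ?)` can only clear the stub flag, and `is_content_downloaded=(is_content_downloaded or ?)` can only set the downloaded flag, so refetching a tweet that has since been deleted never downgrades an archived row (TestNoWorseningTweet below exercises exactly this). A tiny hypothetical illustration of the two update rules:

// flag_semantics.go — hypothetical illustration, not part of this commit.
package main

import "fmt"

func main() {
    // A fully-archived row...
    storedIsStub, storedDownloaded := false, true
    // ...re-saved from a tombstoned fetch.
    incomingIsStub, incomingDownloaded := true, false

    // and: a stub re-save can't turn a fully-fetched tweet back into a stub.
    fmt.Println(storedIsStub && incomingIsStub) // false — stays a real tweet

    // or: a fetch without content can't mark saved content as missing.
    fmt.Println(storedDownloaded || incomingDownloaded) // true — stays downloaded
}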
@@ -83,8 +84,8 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
     db := p.DB
     stmt, err := db.Prepare(`
-        select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, is_content_downloaded
-        from tweets
+        select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, ifnull(tombstone_types.short_name, ""), is_stub, is_content_downloaded
+        from tweets left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
         where id = ?
     `)
@@ -100,19 +101,30 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
     var hashtags string

     row := stmt.QueryRow(id)
-    err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyTo, &t.QuotedTweet, &mentions, &reply_mentions, &hashtags, &t.IsContentDownloaded)
+    err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyTo, &t.QuotedTweet, &mentions, &reply_mentions, &hashtags, &t.TombstoneType, &t.IsStub, &t.IsContentDownloaded)
     if err != nil {
         return t, err
     }
     t.PostedAt = time.Unix(int64(postedAt), 0) // args are `seconds` and `nanoseconds`

+    t.Mentions = []scraper.UserHandle{}
     for _, m := range strings.Split(mentions, ",") {
+        if m != "" {
             t.Mentions = append(t.Mentions, scraper.UserHandle(m))
+        }
     }
+    t.ReplyMentions = []scraper.UserHandle{}
     for _, m := range strings.Split(reply_mentions, ",") {
+        if m != "" {
             t.ReplyMentions = append(t.ReplyMentions, scraper.UserHandle(m))
+        }
     }
-    t.Hashtags = strings.Split(hashtags, ",")
+    t.Hashtags = []string{}
+    for _, h := range strings.Split(hashtags, ",") {
+        if h != "" {
+            t.Hashtags = append(t.Hashtags, h)
+        }
+    }

     imgs, err := p.GetImagesForTweet(t)
     if err != nil {
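The new `!= ""` guards in the loops above are needed because of a strings.Split corner case: splitting an empty string on a comma returns a one-element slice containing "", not an empty slice, so a tweet with no mentions would otherwise round-trip out of the database with a single empty mention. A quick standalone demonstration:

// split_empty.go — standalone demonstration of the strings.Split corner case.
package main

import (
    "fmt"
    "strings"
)

func main() {
    fmt.Printf("%q\n", strings.Split("", ","))    // [""] — one empty element, not an empty slice
    fmt.Printf("%q\n", strings.Split("a,b", ",")) // ["a" "b"]
}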

View File

@@ -34,6 +34,76 @@ func TestSaveAndLoadTweet(t *testing.T) {
     }
 }

+/**
+ * Same as above, but with a tombstone
+ */
+func TestSaveAndLoadTombstone(t *testing.T) {
+    profile_path := "test_profiles/TestTweetQueries"
+    profile := create_or_load_profile(profile_path)
+
+    tweet := create_dummy_tombstone()
+
+    // Save the tweet
+    err := profile.SaveTweet(tweet)
+    if err != nil {
+        t.Fatalf("Failed to save the tweet: %s", err.Error())
+    }
+
+    // Reload the tweet
+    new_tweet, err := profile.GetTweetById(tweet.ID)
+    if err != nil {
+        t.Fatalf("Failed to load the tweet: %s", err.Error())
+    }
+
+    if diff := deep.Equal(tweet, new_tweet); diff != nil {
+        t.Error(diff)
+    }
+}
+
+/**
+ * Saving a tweet that already exists shouldn't reduce its backed-up status.
+ * i.e., content which is already saved shouldn't be marked un-saved if it's removed from Twitter.
+ * After all, that's the whole point of archiving.
+ *
+ * - is_stub should only go from "yes" to "no"
+ * - is_content_downloaded should only go from "no" to "yes"
+ */
+func TestNoWorseningTweet(t *testing.T) {
+    profile_path := "test_profiles/TestTweetQueries"
+    profile := create_or_load_profile(profile_path)
+
+    tweet := create_dummy_tweet()
+    tweet.IsContentDownloaded = true
+    tweet.IsStub = false
+
+    // Save the tweet
+    err := profile.SaveTweet(tweet)
+    if err != nil {
+        t.Fatalf("Failed to save the tweet: %s", err.Error())
+    }
+
+    // Worsen the tweet and re-save it
+    tweet.IsContentDownloaded = false
+    tweet.IsStub = true
+    err = profile.SaveTweet(tweet)
+    if err != nil {
+        t.Fatalf("Failed to save the tweet: %s", err.Error())
+    }
+
+    // Reload the tweet
+    new_tweet, err := profile.GetTweetById(tweet.ID)
+    if err != nil {
+        t.Fatalf("Failed to load the tweet: %s", err.Error())
+    }
+
+    if new_tweet.IsStub != false {
+        t.Errorf("Should have preserved non-stub status")
+    }
+    if new_tweet.IsContentDownloaded != true {
+        t.Errorf("Should have preserved is-content-downloaded status")
+    }
+}
+
 /**
  * Should correctly report whether the User exists in the database
  */

View File

@@ -225,6 +225,24 @@ func create_dummy_tweet() scraper.Tweet {
     }
 }

+/**
+ * Create a random tombstone
+ */
+func create_dummy_tombstone() scraper.Tweet {
+    rand.Seed(time.Now().UnixNano())
+    tweet_id := scraper.TweetID(rand.Int())
+
+    return scraper.Tweet{
+        ID:            tweet_id,
+        UserID:        -1,
+        TombstoneType: "deleted",
+        IsStub:        true,
+        Mentions:      []scraper.UserHandle{},
+        ReplyMentions: []scraper.UserHandle{},
+        Hashtags:      []string{},
+    }
+}
+
 /**
  * Create a new retweet with a random ID for a given TweetID
  */

View File

@@ -32,6 +32,9 @@ type Tweet struct {
     Hashtags    []string
     QuotedTweet TweetID

+    TombstoneType string
+    IsStub        bool
+
     IsContentDownloaded bool
 }
@@ -81,10 +84,13 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
     ret.UserID = UserID(apiTweet.UserID)
     ret.Text = apiTweet.FullText

+    if apiTweet.TombstoneText == "" { // Skip time parsing for tombstones
         ret.PostedAt, err = time.Parse(time.RubyDate, apiTweet.CreatedAt)
         if err != nil {
             return
         }
+    }
+
     ret.NumLikes = apiTweet.FavoriteCount
     ret.NumRetweets = apiTweet.RetweetCount
     ret.NumReplies = apiTweet.ReplyCount
@@ -139,6 +145,10 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
         ret.Videos = []Video{new_video}
         ret.Images = []Image{}
     }
+
+    ret.TombstoneType = apiTweet.TombstoneText
+    ret.IsStub = !(ret.TombstoneType == "")
+
     return
 }
@@ -190,8 +200,20 @@ func GetTweetFull(id TweetID) (tweets []Tweet, retweets []Retweet, users []User,
             return
         }
     }
+
+    tombstone_users := tweet_response.HandleTombstones()
+    fmt.Printf("%v\n", tombstone_users)
+    for _, u := range tombstone_users {
+        fetched_user, err1 := GetUser(UserHandle(u))
+        if err1 != nil { // must check err1; the named return `err` is still nil here
+            err = err1
+            return
+        }
+        fmt.Println(fetched_user)
+        users = append(users, fetched_user)
+    }
-    return ParseTweetResponse(tweet_response)
+    tweets, retweets, _users, err := ParseTweetResponse(tweet_response)
+    users = append(users, _users...)
+    return
 }

 /**

View File

@@ -209,9 +209,40 @@ func TestParseTweetResponse(t *testing.T) {
         t.Errorf("Expected %d tweets, got %d", 29-3, len(tweets))
     }
     if len(retweets) != 3 {
-        t.Errorf("Expected %d tweets, got %d", 3, len(retweets))
+        t.Errorf("Expected %d retweets, got %d", 3, len(retweets))
     }
     if len(users) != 9 {
-        t.Errorf("Expected %d tweets, got %d", 9, len(users))
+        t.Errorf("Expected %d users, got %d", 9, len(users))
+    }
+}
+
+func TestParseTweetResponseWithTombstones(t *testing.T) {
+    data, err := ioutil.ReadFile("test_responses/tombstones/tombstone_deleted.json")
+    if err != nil {
+        panic(err)
+    }
+    var tweet_resp scraper.TweetResponse
+    err = json.Unmarshal(data, &tweet_resp)
+    if err != nil {
+        t.Errorf(err.Error())
+    }
+
+    extra_users := tweet_resp.HandleTombstones()
+    if len(extra_users) != 1 {
+        t.Errorf("Expected to need 1 extra user but got %d instead", len(extra_users))
+    }
+
+    tweets, retweets, users, err := scraper.ParseTweetResponse(tweet_resp)
+    if err != nil {
+        t.Fatal(err)
+    }
+    if len(tweets) != 2 {
+        t.Errorf("Expected %d tweets, got %d", 2, len(tweets))
+    }
+    if len(retweets) != 0 {
+        t.Errorf("Expected %d retweets, got %d", 0, len(retweets))
+    }
+    if len(users) != 1 {
+        t.Errorf("Expected %d users, got %d", 1, len(users))
     }
 }

View File

@@ -18,8 +18,7 @@ func GetUserFeedFor(user_id UserID, min_tweets int) (tweets []Tweet, retweets []
         return
     }

-    if len(tweet_response.GlobalObjects.Tweets) < min_tweets &&
-            tweet_response.GetCursor() != "" {
+    if len(tweet_response.GlobalObjects.Tweets) < min_tweets && tweet_response.GetCursor() != "" {
         err = api.GetMoreTweetsFromFeed(user_id, &tweet_response, min_tweets)
         if err != nil && err != END_OF_FEED {
             return