Persist tombstone data
This commit is contained in:
parent
e5b4b43358
commit
470dce1d27
@ -144,7 +144,7 @@ urls_count_after=$(sqlite3 twitter.db "select count(*) from urls")
|
|||||||
test $urls_count_after = $(($urls_count + 1))
|
test $urls_count_after = $(($urls_count + 1))
|
||||||
test "$(sqlite3 twitter.db "select title from urls where tweet_id = 1428904664645394433")" = "Justice Department investigating Elon Musk's SpaceX following complaint of hiring discrimination"
|
test "$(sqlite3 twitter.db "select title from urls where tweet_id = 1428904664645394433")" = "Justice Department investigating Elon Musk's SpaceX following complaint of hiring discrimination"
|
||||||
test $(sqlite3 twitter.db "select count(*) from urls where tweet_id = 1428904664645394433") = "1"
|
test $(sqlite3 twitter.db "select count(*) from urls where tweet_id = 1428904664645394433") = "1"
|
||||||
thumbnail_name=$(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433" | grep -Po "(?<=/)\w+(?=\?)")
|
thumbnail_name=$(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433" | grep -Po "(?<=/)[\w-]+(?=\?)")
|
||||||
test -n "$thumbnail_name" # Not testing for what the thumbnail url is because it keeps changing
|
test -n "$thumbnail_name" # Not testing for what the thumbnail url is because it keeps changing
|
||||||
|
|
||||||
# Try to double-fetch it; shouldn't duplicate the URL
|
# Try to double-fetch it; shouldn't duplicate the URL
|
||||||
@ -174,6 +174,12 @@ test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id
|
|||||||
test $(find link_preview_images | wc -l) = $initial_link_preview_images_count # Should be the same
|
test $(find link_preview_images | wc -l) = $initial_link_preview_images_count # Should be the same
|
||||||
|
|
||||||
|
|
||||||
|
# Test a tweet thread with tombstones
|
||||||
|
tw fetch_tweet https://twitter.com/CovfefeAnon/status/1454526270809726977
|
||||||
|
test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454515503242829830") = 1
|
||||||
|
test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454521424144654344") = 0
|
||||||
|
test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454522147750260742") = 1
|
||||||
|
|
||||||
|
|
||||||
# TODO: Maybe this file should be broken up into multiple test scripts
|
# TODO: Maybe this file should be broken up into multiple test scripts
|
||||||
|
|
||||||
|
@ -19,10 +19,18 @@ create table users (rowid integer primary key,
|
|||||||
pinned_tweet_id integer check(typeof(pinned_tweet_id) = 'integer' or pinned_tweet_id = ''),
|
pinned_tweet_id integer check(typeof(pinned_tweet_id) = 'integer' or pinned_tweet_id = ''),
|
||||||
|
|
||||||
is_content_downloaded boolean default 0
|
is_content_downloaded boolean default 0
|
||||||
|
|
||||||
-- foreign key(pinned_tweet_id) references tweets(id)
|
|
||||||
);
|
);
|
||||||
|
|
||||||
|
-- Lookup table of tombstone kinds.  `tweets.tombstone_type` is joined against this table's rowid,
-- and tweets are saved by looking a row up via its `short_name`.
create table tombstone_types (
    rowid integer primary key,
    short_name text not null unique,
    tombstone_text text not null unique
);

-- Seed rows; rowids are assigned explicitly.
-- NOTE(review): '???' looks like a placeholder for the "suspended" tombstone text -- confirm the real wording.
insert into tombstone_types(rowid, short_name, tombstone_text)
values
    (1, 'deleted', 'This Tweet was deleted by the Tweet author'),
    (2, 'suspended', '???'),
    (3, 'hidden', 'You’re unable to view this Tweet because this account owner limits who can view their Tweets'),
    (4, 'unavailable', 'This Tweet is unavailable');
|
||||||
|
|
||||||
create table tweets (rowid integer primary key,
|
create table tweets (rowid integer primary key,
|
||||||
id integer unique not null check(typeof(id) = 'integer'),
|
id integer unique not null check(typeof(id) = 'integer'),
|
||||||
user_id integer not null check(typeof(user_id) = 'integer'),
|
user_id integer not null check(typeof(user_id) = 'integer'),
|
||||||
@ -37,11 +45,11 @@ create table tweets (rowid integer primary key,
|
|||||||
mentions text, -- comma-separated
|
mentions text, -- comma-separated
|
||||||
reply_mentions text, -- comma-separated
|
reply_mentions text, -- comma-separated
|
||||||
hashtags text, -- comma-separated
|
hashtags text, -- comma-separated
|
||||||
|
tombstone_type integer default 0,
|
||||||
|
is_stub boolean default 0,
|
||||||
|
|
||||||
is_content_downloaded boolean default 0,
|
is_content_downloaded boolean default 0,
|
||||||
foreign key(user_id) references users(id)
|
foreign key(user_id) references users(id)
|
||||||
-- foreign key(in_reply_to) references tweets(id),
|
|
||||||
-- foreign key(quoted_tweet) references tweets(id)
|
|
||||||
);
|
);
|
||||||
|
|
||||||
create table retweets(rowid integer primary key,
|
create table retweets(rowid integer primary key,
|
||||||
@ -71,8 +79,6 @@ create table urls (rowid integer primary key,
|
|||||||
|
|
||||||
unique (tweet_id, text)
|
unique (tweet_id, text)
|
||||||
foreign key(tweet_id) references tweets(id)
|
foreign key(tweet_id) references tweets(id)
|
||||||
-- foreign key(creator_id) references users(id)
|
|
||||||
-- foreign key(site_id) references users(id)
|
|
||||||
);
|
);
|
||||||
|
|
||||||
create table images (rowid integer primary key,
|
create table images (rowid integer primary key,
|
||||||
|
@ -16,17 +16,18 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
_, err = db.Exec(`
|
_, err = db.Exec(`
|
||||||
insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, is_content_downloaded)
|
insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, tombstone_type, is_stub, is_content_downloaded)
|
||||||
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, (select rowid from tombstone_types where short_name=?), ?, ?)
|
||||||
on conflict do update
|
on conflict do update
|
||||||
set num_likes=?,
|
set num_likes=?,
|
||||||
num_retweets=?,
|
num_retweets=?,
|
||||||
num_replies=?,
|
num_replies=?,
|
||||||
num_quote_tweets=?,
|
num_quote_tweets=?,
|
||||||
|
is_stub=(is_stub and ?),
|
||||||
is_content_downloaded=(is_content_downloaded or ?)
|
is_content_downloaded=(is_content_downloaded or ?)
|
||||||
`,
|
`,
|
||||||
t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions), strings.Join(t.Hashtags, ","), t.IsContentDownloaded,
|
t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions), strings.Join(t.Hashtags, ","), t.TombstoneType, t.IsStub, t.IsContentDownloaded,
|
||||||
t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsContentDownloaded,
|
t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsStub, t.IsContentDownloaded,
|
||||||
)
|
)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -83,8 +84,8 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
|
|||||||
db := p.DB
|
db := p.DB
|
||||||
|
|
||||||
stmt, err := db.Prepare(`
|
stmt, err := db.Prepare(`
|
||||||
select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, is_content_downloaded
|
select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, ifnull(tombstone_types.short_name, ""), is_stub, is_content_downloaded
|
||||||
from tweets
|
from tweets left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
|
||||||
where id = ?
|
where id = ?
|
||||||
`)
|
`)
|
||||||
|
|
||||||
@ -100,19 +101,30 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
|
|||||||
var hashtags string
|
var hashtags string
|
||||||
|
|
||||||
row := stmt.QueryRow(id)
|
row := stmt.QueryRow(id)
|
||||||
err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyTo, &t.QuotedTweet, &mentions, &reply_mentions, &hashtags, &t.IsContentDownloaded)
|
err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyTo, &t.QuotedTweet, &mentions, &reply_mentions, &hashtags, &t.TombstoneType, &t.IsStub, &t.IsContentDownloaded)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return t, err
|
return t, err
|
||||||
}
|
}
|
||||||
|
|
||||||
t.PostedAt = time.Unix(int64(postedAt), 0) // args are `seconds` and `nanoseconds`
|
t.PostedAt = time.Unix(int64(postedAt), 0) // args are `seconds` and `nanoseconds`
|
||||||
|
t.Mentions = []scraper.UserHandle{}
|
||||||
for _, m := range strings.Split(mentions, ",") {
|
for _, m := range strings.Split(mentions, ",") {
|
||||||
|
if m != "" {
|
||||||
t.Mentions = append(t.Mentions, scraper.UserHandle(m))
|
t.Mentions = append(t.Mentions, scraper.UserHandle(m))
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
t.ReplyMentions = []scraper.UserHandle{}
|
||||||
for _, m := range strings.Split(reply_mentions, ",") {
|
for _, m := range strings.Split(reply_mentions, ",") {
|
||||||
|
if m != "" {
|
||||||
t.ReplyMentions = append(t.ReplyMentions, scraper.UserHandle(m))
|
t.ReplyMentions = append(t.ReplyMentions, scraper.UserHandle(m))
|
||||||
}
|
}
|
||||||
t.Hashtags = strings.Split(hashtags, ",")
|
}
|
||||||
|
t.Hashtags = []string{}
|
||||||
|
for _, h := range strings.Split(hashtags, ",") {
|
||||||
|
if h != "" {
|
||||||
|
t.Hashtags = append(t.Hashtags, h)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
imgs, err := p.GetImagesForTweet(t)
|
imgs, err := p.GetImagesForTweet(t)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -34,6 +34,76 @@ func TestSaveAndLoadTweet(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Same as above, but with a tombstone
|
||||||
|
*/
|
||||||
|
func TestSaveAndLoadTombstone(t *testing.T) {
|
||||||
|
profile_path := "test_profiles/TestTweetQueries"
|
||||||
|
profile := create_or_load_profile(profile_path)
|
||||||
|
|
||||||
|
tweet := create_dummy_tombstone()
|
||||||
|
|
||||||
|
// Save the tweet
|
||||||
|
err := profile.SaveTweet(tweet)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to save the tweet: %s", err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reload the tweet
|
||||||
|
new_tweet, err := profile.GetTweetById(tweet.ID)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to load the tweet: %s", err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
if diff := deep.Equal(tweet, new_tweet); diff != nil {
|
||||||
|
t.Error(diff)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Saving a tweet that already exists shouldn't reduce its backed-up status.
|
||||||
|
* i.e., content which is already saved shouldn't be marked un-saved if it's removed from Twitter.
|
||||||
|
* After all, that's the whole point of archiving.
|
||||||
|
*
|
||||||
|
* - is_stub should only go from "yes" to "no"
|
||||||
|
* - is_content_downloaded should only go from "no" to "yes"
|
||||||
|
*/
|
||||||
|
func TestNoWorseningTweet(t *testing.T) {
|
||||||
|
profile_path := "test_profiles/TestTweetQueries"
|
||||||
|
profile := create_or_load_profile(profile_path)
|
||||||
|
|
||||||
|
tweet := create_dummy_tweet()
|
||||||
|
tweet.IsContentDownloaded = true
|
||||||
|
tweet.IsStub = false
|
||||||
|
|
||||||
|
// Save the tweet
|
||||||
|
err := profile.SaveTweet(tweet)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to save the tweet: %s", err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Worsen the tweet and re-save it
|
||||||
|
tweet.IsContentDownloaded = false
|
||||||
|
tweet.IsStub = true
|
||||||
|
err = profile.SaveTweet(tweet)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to save the tweet: %s", err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reload the tweet
|
||||||
|
new_tweet, err := profile.GetTweetById(tweet.ID)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to load the tweet: %s", err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
if new_tweet.IsStub != false {
|
||||||
|
t.Errorf("Should have preserved non-stub status")
|
||||||
|
}
|
||||||
|
if new_tweet.IsContentDownloaded != true {
|
||||||
|
t.Errorf("Should have preserved is-content-downloaded status")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Should correctly report whether the User exists in the database
|
* Should correctly report whether the User exists in the database
|
||||||
*/
|
*/
|
||||||
|
@ -225,6 +225,24 @@ func create_dummy_tweet() scraper.Tweet {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a random tombstone
|
||||||
|
*/
|
||||||
|
func create_dummy_tombstone() scraper.Tweet {
|
||||||
|
rand.Seed(time.Now().UnixNano())
|
||||||
|
tweet_id := scraper.TweetID(rand.Int())
|
||||||
|
|
||||||
|
return scraper.Tweet{
|
||||||
|
ID: tweet_id,
|
||||||
|
UserID: -1,
|
||||||
|
TombstoneType: "deleted",
|
||||||
|
IsStub: true,
|
||||||
|
Mentions: []scraper.UserHandle{},
|
||||||
|
ReplyMentions: []scraper.UserHandle{},
|
||||||
|
Hashtags: []string{},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new retweet with a random ID for a given TweetID
|
* Create a new retweet with a random ID for a given TweetID
|
||||||
*/
|
*/
|
||||||
|
@ -32,6 +32,9 @@ type Tweet struct {
|
|||||||
Hashtags []string
|
Hashtags []string
|
||||||
QuotedTweet TweetID
|
QuotedTweet TweetID
|
||||||
|
|
||||||
|
TombstoneType string
|
||||||
|
IsStub bool
|
||||||
|
|
||||||
IsContentDownloaded bool
|
IsContentDownloaded bool
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -81,10 +84,13 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
|||||||
ret.UserID = UserID(apiTweet.UserID)
|
ret.UserID = UserID(apiTweet.UserID)
|
||||||
ret.Text = apiTweet.FullText
|
ret.Text = apiTweet.FullText
|
||||||
|
|
||||||
|
if apiTweet.TombstoneText == "" { // Skip time parsing for tombstones
|
||||||
ret.PostedAt, err = time.Parse(time.RubyDate, apiTweet.CreatedAt)
|
ret.PostedAt, err = time.Parse(time.RubyDate, apiTweet.CreatedAt)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
ret.NumLikes = apiTweet.FavoriteCount
|
ret.NumLikes = apiTweet.FavoriteCount
|
||||||
ret.NumRetweets = apiTweet.RetweetCount
|
ret.NumRetweets = apiTweet.RetweetCount
|
||||||
ret.NumReplies = apiTweet.ReplyCount
|
ret.NumReplies = apiTweet.ReplyCount
|
||||||
@ -139,6 +145,10 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
|||||||
ret.Videos = []Video{new_video}
|
ret.Videos = []Video{new_video}
|
||||||
ret.Images = []Image{}
|
ret.Images = []Image{}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ret.TombstoneType = apiTweet.TombstoneText
|
||||||
|
ret.IsStub = !(ret.TombstoneType == "")
|
||||||
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -190,8 +200,20 @@ func GetTweetFull(id TweetID) (tweets []Tweet, retweets []Retweet, users []User,
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
tombstone_users := tweet_response.HandleTombstones()
|
||||||
return ParseTweetResponse(tweet_response)
|
fmt.Printf("%v\n", tombstone_users)
|
||||||
|
// Fetch every user that HandleTombstones reported, and add them to the returned users.
for _, u := range tombstone_users {
	fetched_user, err1 := GetUser(UserHandle(u))
	// Check GetUser's own error (err1).  Testing `err` here would never fire, so a failed
	// fetch would go unnoticed and the user value returned alongside the error would be appended.
	if err1 != nil {
		err = err1
		return
	}
	fmt.Println(fetched_user)
	users = append(users, fetched_user)
}
|
||||||
|
tweets, retweets, _users, err := ParseTweetResponse(tweet_response)
|
||||||
|
users = append(users, _users...)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -209,9 +209,40 @@ func TestParseTweetResponse(t *testing.T) {
|
|||||||
t.Errorf("Expected %d tweets, got %d", 29-3, len(tweets))
|
t.Errorf("Expected %d tweets, got %d", 29-3, len(tweets))
|
||||||
}
|
}
|
||||||
if len(retweets) != 3 {
|
if len(retweets) != 3 {
|
||||||
t.Errorf("Expected %d tweets, got %d", 3, len(retweets))
|
t.Errorf("Expected %d retweets, got %d", 3, len(retweets))
|
||||||
}
|
}
|
||||||
if len(users) != 9 {
|
if len(users) != 9 {
|
||||||
t.Errorf("Expected %d tweets, got %d", 9, len(users))
|
t.Errorf("Expected %d users, got %d", 9, len(users))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseTweetResponseWithTombstones(t *testing.T) {
|
||||||
|
data, err := ioutil.ReadFile("test_responses/tombstones/tombstone_deleted.json")
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
var tweet_resp scraper.TweetResponse
|
||||||
|
err = json.Unmarshal(data, &tweet_resp)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf(err.Error())
|
||||||
|
}
|
||||||
|
extra_users := tweet_resp.HandleTombstones()
|
||||||
|
if len(extra_users) != 1 {
|
||||||
|
t.Errorf("Expected to need 1 extra user but got %d instead", len(extra_users))
|
||||||
|
}
|
||||||
|
|
||||||
|
tweets, retweets, users, err := scraper.ParseTweetResponse(tweet_resp)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(tweets) != 2 {
|
||||||
|
t.Errorf("Expected %d tweets, got %d", 2, len(tweets))
|
||||||
|
}
|
||||||
|
if len(retweets) != 0 {
|
||||||
|
t.Errorf("Expected %d retweets, got %d", 0, len(retweets))
|
||||||
|
}
|
||||||
|
if len(users) != 1 {
|
||||||
|
t.Errorf("Expected %d users, got %d", 1, len(users))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -18,8 +18,7 @@ func GetUserFeedFor(user_id UserID, min_tweets int) (tweets []Tweet, retweets []
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(tweet_response.GlobalObjects.Tweets) < min_tweets &&
|
if len(tweet_response.GlobalObjects.Tweets) < min_tweets && tweet_response.GetCursor() != "" {
|
||||||
tweet_response.GetCursor() != "" {
|
|
||||||
err = api.GetMoreTweetsFromFeed(user_id, &tweet_response, min_tweets)
|
err = api.GetMoreTweetsFromFeed(user_id, &tweet_response, min_tweets)
|
||||||
if err != nil && err != END_OF_FEED {
|
if err != nil && err != END_OF_FEED {
|
||||||
return
|
return
|
||||||
|
Loading…
x
Reference in New Issue
Block a user