diff --git a/cmd/tests.sh b/cmd/tests.sh
index 8c50964..1a1f2e2 100755
--- a/cmd/tests.sh
+++ b/cmd/tests.sh
@@ -144,7 +144,7 @@ urls_count_after=$(sqlite3 twitter.db "select count(*) from urls")
 test $urls_count_after = $(($urls_count + 1))
 test "$(sqlite3 twitter.db "select title from urls where tweet_id = 1428904664645394433")" = "Justice Department investigating Elon Musk's SpaceX following complaint of hiring discrimination"
 test $(sqlite3 twitter.db "select count(*) from urls where tweet_id = 1428904664645394433") = "1"
-thumbnail_name=$(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433" | grep -Po "(?<=/)\w+(?=\?)")
+thumbnail_name=$(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433" | grep -Po "(?<=/)[\w-]+(?=\?)")
 test -n "$thumbnail_name"  # Not testing for what the thumbnail url is because it keeps changing
 
 # Try to double-fetch it; shouldn't duplicate the URL
@@ -174,6 +174,12 @@ test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id
 test $(find link_preview_images | wc -l) = $initial_link_preview_images_count  # Should be the same
 
 
+# Test a tweet thread with tombstones
+tw fetch_tweet https://twitter.com/CovfefeAnon/status/1454526270809726977
+test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454515503242829830") = 1
+test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454521424144654344") = 0
+test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454522147750260742") = 1
+
 
 
 # TODO: Maybe this file should be broken up into multiple test scripts
diff --git a/persistence/schema.sql b/persistence/schema.sql
index 13592ce..916b17f 100644
--- a/persistence/schema.sql
+++ b/persistence/schema.sql
@@ -19,10 +19,18 @@ create table users (rowid integer primary key,
     pinned_tweet_id integer check(typeof(pinned_tweet_id) = 'integer' or pinned_tweet_id = ''),
     is_content_downloaded boolean default 0
-
-    -- foreign key(pinned_tweet_id) references tweets(id)
 );
 
 
+create table tombstone_types (rowid integer primary key,
+    short_name text not null unique,
+    tombstone_text text not null unique
+);
+insert into tombstone_types(rowid, short_name, tombstone_text) values
+    (1, 'deleted', 'This Tweet was deleted by the Tweet author'),
+    (2, 'suspended', '???'),
+    (3, 'hidden', 'You’re unable to view this Tweet because this account owner limits who can view their Tweets'),
+    (4, 'unavailable', 'This Tweet is unavailable');
+
 create table tweets (rowid integer primary key,
     id integer unique not null check(typeof(id) = 'integer'),
     user_id integer not null check(typeof(user_id) = 'integer'),
@@ -37,11 +45,11 @@ create table tweets (rowid integer primary key,
     mentions text,        -- comma-separated
     reply_mentions text,  -- comma-separated
     hashtags text,        -- comma-separated
+    tombstone_type integer default 0,
+    is_stub boolean default 0,
     is_content_downloaded boolean default 0,
 
     foreign key(user_id) references users(id)
-    -- foreign key(in_reply_to) references tweets(id),
-    -- foreign key(quoted_tweet) references tweets(id)
 );
 
 create table retweets(rowid integer primary key,
@@ -71,8 +79,6 @@ create table urls (rowid integer primary key,
     unique (tweet_id, text)
 
     foreign key(tweet_id) references tweets(id)
-    -- foreign key(creator_id) references users(id)
-    -- foreign key(site_id) references users(id)
 );
 
 create table images (rowid integer primary key,
diff --git a/persistence/tweet_queries.go b/persistence/tweet_queries.go
index 0851ef5..8645eb3 100644
--- a/persistence/tweet_queries.go
+++ b/persistence/tweet_queries.go
@@ -16,17 +16,18 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
 		return err
 	}
 	_, err = db.Exec(`
-		insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, is_content_downloaded)
-		values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+		insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, tombstone_type, is_stub, is_content_downloaded)
+		values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, (select rowid from tombstone_types where short_name=?), ?, ?)
 			on conflict do update
 			set num_likes=?,
 				num_retweets=?,
 				num_replies=?,
 				num_quote_tweets=?,
+				is_stub=(is_stub and ?),
 				is_content_downloaded=(is_content_downloaded or ?)
 		`,
-		t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions), strings.Join(t.Hashtags, ","), t.IsContentDownloaded,
-		t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsContentDownloaded,
+		t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions), strings.Join(t.Hashtags, ","), t.TombstoneType, t.IsStub, t.IsContentDownloaded,
+		t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsStub, t.IsContentDownloaded,
 	)
 
 	if err != nil {
@@ -83,8 +84,8 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
 	db := p.DB
 
 	stmt, err := db.Prepare(`
-		select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, is_content_downloaded
-		from tweets
+		select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, ifnull(tombstone_types.short_name, ""), is_stub, is_content_downloaded
+		from tweets left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
 		where id = ?
 	`)

@@ -100,19 +101,30 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
 	var hashtags string
 
 	row := stmt.QueryRow(id)
-	err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyTo, &t.QuotedTweet, &mentions, &reply_mentions, &hashtags, &t.IsContentDownloaded)
+	err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyTo, &t.QuotedTweet, &mentions, &reply_mentions, &hashtags, &t.TombstoneType, &t.IsStub, &t.IsContentDownloaded)
 	if err != nil {
 		return t, err
 	}
 
 	t.PostedAt = time.Unix(int64(postedAt), 0)  // args are `seconds` and `nanoseconds`
+	t.Mentions = []scraper.UserHandle{}
 	for _, m := range strings.Split(mentions, ",") {
-		t.Mentions = append(t.Mentions, scraper.UserHandle(m))
+		if m != "" {
+			t.Mentions = append(t.Mentions, scraper.UserHandle(m))
+		}
 	}
+	t.ReplyMentions = []scraper.UserHandle{}
 	for _, m := range strings.Split(reply_mentions, ",") {
-		t.ReplyMentions = append(t.ReplyMentions, scraper.UserHandle(m))
+		if m != "" {
+			t.ReplyMentions = append(t.ReplyMentions, scraper.UserHandle(m))
+		}
+	}
+	t.Hashtags = []string{}
+	for _, h := range strings.Split(hashtags, ",") {
+		if h != "" {
+			t.Hashtags = append(t.Hashtags, h)
+		}
 	}
-	t.Hashtags = strings.Split(hashtags, ",")
 
 	imgs, err := p.GetImagesForTweet(t)
 	if err != nil {
diff --git a/persistence/tweet_queries_test.go b/persistence/tweet_queries_test.go
index eb44df5..81c4db6 100644
--- a/persistence/tweet_queries_test.go
+++ b/persistence/tweet_queries_test.go
@@ -34,6 +34,76 @@ func TestSaveAndLoadTweet(t *testing.T) {
 	}
 }
 
+/**
+ * Same as above, but with a tombstone
+ */
+func TestSaveAndLoadTombstone(t *testing.T) {
+	profile_path := "test_profiles/TestTweetQueries"
+	profile := create_or_load_profile(profile_path)
+
+	tweet := create_dummy_tombstone()
+
+	// Save the tweet
+	err := profile.SaveTweet(tweet)
+	if err != nil {
+		t.Fatalf("Failed to save the tweet: %s", err.Error())
+	}
+
+	// Reload the tweet
+	new_tweet, err := profile.GetTweetById(tweet.ID)
+	if err != nil {
+		t.Fatalf("Failed to load the tweet: %s", err.Error())
+	}
+
+	if diff := deep.Equal(tweet, new_tweet); diff != nil {
+		t.Error(diff)
+	}
+}
+
+/**
+ * Saving a tweet that already exists shouldn't reduce its backed-up status.
+ * i.e., content which is already saved shouldn't be marked un-saved if it's removed from Twitter.
+ * After all, that's the whole point of archiving.
+ *
+ * - is_stub should only go from "yes" to "no"
+ * - is_content_downloaded should only go from "no" to "yes"
+ */
+func TestNoWorseningTweet(t *testing.T) {
+	profile_path := "test_profiles/TestTweetQueries"
+	profile := create_or_load_profile(profile_path)
+
+	tweet := create_dummy_tweet()
+	tweet.IsContentDownloaded = true
+	tweet.IsStub = false
+
+	// Save the tweet
+	err := profile.SaveTweet(tweet)
+	if err != nil {
+		t.Fatalf("Failed to save the tweet: %s", err.Error())
+	}
+
+	// Worsen the tweet and re-save it
+	tweet.IsContentDownloaded = false
+	tweet.IsStub = true
+	err = profile.SaveTweet(tweet)
+	if err != nil {
+		t.Fatalf("Failed to save the tweet: %s", err.Error())
+	}
+
+	// Reload the tweet
+	new_tweet, err := profile.GetTweetById(tweet.ID)
+	if err != nil {
+		t.Fatalf("Failed to load the tweet: %s", err.Error())
+	}
+
+	if new_tweet.IsStub != false {
+		t.Errorf("Should have preserved non-stub status")
+	}
+	if new_tweet.IsContentDownloaded != true {
+		t.Errorf("Should have preserved is-content-downloaded status")
+	}
+}
+
 /**
  * Should correctly report whether the User exists in the database
  */
diff --git a/persistence/utils_test.go b/persistence/utils_test.go
index 02bce59..b3fe82f 100644
--- a/persistence/utils_test.go
+++ b/persistence/utils_test.go
@@ -225,6 +225,24 @@ func create_dummy_tweet() scraper.Tweet {
 	}
 }
 
+/**
+ * Create a random tombstone
+ */
+func create_dummy_tombstone() scraper.Tweet {
+	rand.Seed(time.Now().UnixNano())
+	tweet_id := scraper.TweetID(rand.Int())
+
+	return scraper.Tweet{
+		ID:            tweet_id,
+		UserID:        -1,
+		TombstoneType: "deleted",
+		IsStub:        true,
+		Mentions:      []scraper.UserHandle{},
+		ReplyMentions: []scraper.UserHandle{},
+		Hashtags:      []string{},
+	}
+}
+
 /**
  * Create a new retweet with a random ID for a given TweetID
  */
diff --git a/scraper/tweet.go b/scraper/tweet.go
index d202afc..16b27dd 100644
--- a/scraper/tweet.go
+++ b/scraper/tweet.go
@@ -32,6 +32,9 @@ type Tweet struct {
 	Hashtags      []string
 	QuotedTweet   TweetID
 
+	TombstoneType string
+	IsStub        bool
+
 	IsContentDownloaded bool
 }
 
@@ -81,10 +84,13 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
 	ret.UserID = UserID(apiTweet.UserID)
 	ret.Text = apiTweet.FullText
 
-	ret.PostedAt, err = time.Parse(time.RubyDate, apiTweet.CreatedAt)
-	if err != nil {
-		return
+	if apiTweet.TombstoneText == "" {  // Skip time parsing for tombstones
+		ret.PostedAt, err = time.Parse(time.RubyDate, apiTweet.CreatedAt)
+		if err != nil {
+			return
+		}
 	}
+
 	ret.NumLikes = apiTweet.FavoriteCount
 	ret.NumRetweets = apiTweet.RetweetCount
 	ret.NumReplies = apiTweet.ReplyCount
@@ -139,6 +145,10 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
 		ret.Videos = []Video{new_video}
 		ret.Images = []Image{}
 	}
+
+	ret.TombstoneType = apiTweet.TombstoneText
+	ret.IsStub = !(ret.TombstoneType == "")
+
 	return
 }
 
@@ -190,8 +200,20 @@ func GetTweetFull(id TweetID) (tweets []Tweet, retweets []Retweet, users []User,
 			return
 		}
 	}
-
-	return ParseTweetResponse(tweet_response)
+	tombstone_users := tweet_response.HandleTombstones()
+	fmt.Printf("%v\n", tombstone_users)
+	for _, u := range tombstone_users {
+		fetched_user, err1 := GetUser(UserHandle(u))
+		if err1 != nil {
+			err = err1
+			return
+		}
+		fmt.Println(fetched_user)
+		users = append(users, fetched_user)
+	}
+	tweets, retweets, _users, err := ParseTweetResponse(tweet_response)
+	users = append(users, _users...)
+	return
 }
 
 /**
diff --git a/scraper/tweet_test.go b/scraper/tweet_test.go
index d032b07..63c7f90 100644
--- a/scraper/tweet_test.go
+++ b/scraper/tweet_test.go
@@ -209,9 +209,40 @@ func TestParseTweetResponse(t *testing.T) {
 		t.Errorf("Expected %d tweets, got %d", 29-3, len(tweets))
 	}
 	if len(retweets) != 3 {
-		t.Errorf("Expected %d tweets, got %d", 3, len(retweets))
+		t.Errorf("Expected %d retweets, got %d", 3, len(retweets))
 	}
 	if len(users) != 9 {
-		t.Errorf("Expected %d tweets, got %d", 9, len(users))
+		t.Errorf("Expected %d users, got %d", 9, len(users))
+	}
+}
+
+func TestParseTweetResponseWithTombstones(t *testing.T) {
+	data, err := ioutil.ReadFile("test_responses/tombstones/tombstone_deleted.json")
+	if err != nil {
+		panic(err)
+	}
+	var tweet_resp scraper.TweetResponse
+	err = json.Unmarshal(data, &tweet_resp)
+	if err != nil {
+		t.Errorf(err.Error())
+	}
+	extra_users := tweet_resp.HandleTombstones()
+	if len(extra_users) != 1 {
+		t.Errorf("Expected to need 1 extra user but got %d instead", len(extra_users))
+	}
+
+	tweets, retweets, users, err := scraper.ParseTweetResponse(tweet_resp)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if len(tweets) != 2 {
+		t.Errorf("Expected %d tweets, got %d", 2, len(tweets))
+	}
+	if len(retweets) != 0 {
+		t.Errorf("Expected %d retweets, got %d", 0, len(retweets))
+	}
+	if len(users) != 1 {
+		t.Errorf("Expected %d users, got %d", 1, len(users))
 	}
 }
diff --git a/scraper/user_feed.go b/scraper/user_feed.go
index aec24b4..e30ce87 100644
--- a/scraper/user_feed.go
+++ b/scraper/user_feed.go
@@ -18,8 +18,7 @@ func GetUserFeedFor(user_id UserID, min_tweets int) (tweets []Tweet, retweets []
 		return
 	}
 
-	if len(tweet_response.GlobalObjects.Tweets) < min_tweets &&
-			tweet_response.GetCursor() != "" {
+	if len(tweet_response.GlobalObjects.Tweets) < min_tweets && tweet_response.GetCursor() != "" {
 		err = api.GetMoreTweetsFromFeed(user_id, &tweet_response, min_tweets)
 		if err != nil && err != END_OF_FEED {
 			return