Persist tombstone data

Alessio 2021-11-06 13:37:46 -07:00
parent e5b4b43358
commit 470dce1d27
8 changed files with 190 additions and 26 deletions

View File

@@ -144,7 +144,7 @@ urls_count_after=$(sqlite3 twitter.db "select count(*) from urls")
 test $urls_count_after = $(($urls_count + 1))
 test "$(sqlite3 twitter.db "select title from urls where tweet_id = 1428904664645394433")" = "Justice Department investigating Elon Musk's SpaceX following complaint of hiring discrimination"
 test $(sqlite3 twitter.db "select count(*) from urls where tweet_id = 1428904664645394433") = "1"
-thumbnail_name=$(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433" | grep -Po "(?<=/)\w+(?=\?)")
+thumbnail_name=$(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433" | grep -Po "(?<=/)[\w-]+(?=\?)")
 test -n "$thumbnail_name" # Not testing for what the thumbnail url is because it keeps changing

 # Try to double-fetch it; shouldn't duplicate the URL
@@ -174,6 +174,12 @@ test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id
 test $(find link_preview_images | wc -l) = $initial_link_preview_images_count # Should be the same
+
+# Test a tweet thread with tombstones
+tw fetch_tweet https://twitter.com/CovfefeAnon/status/1454526270809726977
+test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454515503242829830") = 1
+test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454521424144654344") = 0
+test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454522147750260742") = 1
 # TODO: Maybe this file should be broken up into multiple test scripts

View File

@@ -19,10 +19,18 @@ create table users (rowid integer primary key,
     pinned_tweet_id integer check(typeof(pinned_tweet_id) = 'integer' or pinned_tweet_id = ''),
     is_content_downloaded boolean default 0
-    -- foreign key(pinned_tweet_id) references tweets(id)
 );

+create table tombstone_types (rowid integer primary key,
+    short_name text not null unique,
+    tombstone_text text not null unique
+);
+insert into tombstone_types(rowid, short_name, tombstone_text) values
+    (1, 'deleted', 'This Tweet was deleted by the Tweet author'),
+    (2, 'suspended', '???'),
+    (3, 'hidden', 'You’re unable to view this Tweet because this account owner limits who can view their Tweets'),
+    (4, 'unavailable', 'This Tweet is unavailable');
+
 create table tweets (rowid integer primary key,
     id integer unique not null check(typeof(id) = 'integer'),
     user_id integer not null check(typeof(user_id) = 'integer'),
@@ -37,11 +45,11 @@ create table tweets (rowid integer primary key,
     mentions text, -- comma-separated
     reply_mentions text, -- comma-separated
     hashtags text, -- comma-separated
+    tombstone_type integer default 0,
+    is_stub boolean default 0,
     is_content_downloaded boolean default 0,
     foreign key(user_id) references users(id)
-    -- foreign key(in_reply_to) references tweets(id),
-    -- foreign key(quoted_tweet) references tweets(id)
 );

 create table retweets(rowid integer primary key,
@@ -71,8 +79,6 @@ create table urls (rowid integer primary key,
     unique (tweet_id, text),
     foreign key(tweet_id) references tweets(id)
-    -- foreign key(creator_id) references users(id)
-    -- foreign key(site_id) references users(id)
 );

 create table images (rowid integer primary key,
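The tombstone short name is never stored directly: SaveTweet (below) resolves it to a tombstone_types rowid with a subquery, and GetTweetById resolves the rowid back with a left join, so a tweet whose tombstone_type is the default 0 matches no lookup row and ifnull() collapses the NULL back to "". A minimal standalone sketch of that round trip, assuming the schema above is already applied to twitter.db via the mattn/go-sqlite3 driver (the file name, database path, and elided tweet columns are illustrative only):

// tombstone_roundtrip.go — an illustrative sketch, not part of this commit.
package main

import (
    "database/sql"
    "fmt"
    "log"

    _ "github.com/mattn/go-sqlite3"
)

func main() {
    db, err := sql.Open("sqlite3", "twitter.db") // assumes the schema above is loaded
    if err != nil {
        log.Fatal(err)
    }
    defer db.Close()

    // Write: map the short name to its lookup-table rowid, as SaveTweet does.
    _, err = db.Exec(`insert into tweets (id, user_id, tombstone_type, is_stub)
        values (?, ?, (select rowid from tombstone_types where short_name = ?), ?)`,
        1454515503242829830, -1, "hidden", 1)
    if err != nil {
        log.Fatal(err)
    }

    // Read: resolve the rowid back to the short name, as GetTweetById does.
    var shortName string
    err = db.QueryRow(`select ifnull(tombstone_types.short_name, '')
        from tweets left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
        where tweets.id = ?`, 1454515503242829830).Scan(&shortName)
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(shortName) // prints "hidden"
}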

View File

@@ -16,17 +16,18 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
         return err
     }
     _, err = db.Exec(`
-        insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, is_content_downloaded)
-        values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, tombstone_type, is_stub, is_content_downloaded)
+        values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, (select rowid from tombstone_types where short_name=?), ?, ?)
         on conflict do update
         set num_likes=?,
             num_retweets=?,
             num_replies=?,
             num_quote_tweets=?,
+            is_stub=(is_stub and ?),
             is_content_downloaded=(is_content_downloaded or ?)
         `,
-        t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions), strings.Join(t.Hashtags, ","), t.IsContentDownloaded,
-        t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsContentDownloaded,
+        t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), scraper.JoinArrayOfHandles(t.ReplyMentions), strings.Join(t.Hashtags, ","), t.TombstoneType, t.IsStub, t.IsContentDownloaded,
+        t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsStub, t.IsContentDownloaded,
     )
     if err != nil {
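The conflict clause above is what makes re-saving safe: `is_stub=(is_stub and ?)` can only clear the stub flag, and `is_content_downloaded=(is_content_downloaded or ?)` can only set the downloaded flag, so refetching a tweet that has since been deleted never downgrades an archived row (TestNoWorseningTweet below exercises exactly this). A tiny hypothetical illustration of the two update rules:

// flag_semantics.go — hypothetical illustration, not part of this commit.
package main

import "fmt"

func main() {
    // A fully-archived row...
    storedIsStub, storedDownloaded := false, true
    // ...re-saved from a tombstoned fetch.
    incomingIsStub, incomingDownloaded := true, false

    // and: a stub re-save can't turn a fully-fetched tweet back into a stub.
    fmt.Println(storedIsStub && incomingIsStub) // false — stays a real tweet

    // or: a fetch without content can't mark saved content as missing.
    fmt.Println(storedDownloaded || incomingDownloaded) // true — stays downloaded
}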
@@ -83,8 +84,8 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
     db := p.DB
     stmt, err := db.Prepare(`
-        select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, is_content_downloaded
-        from tweets
+        select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, reply_mentions, hashtags, ifnull(tombstone_types.short_name, ""), is_stub, is_content_downloaded
+        from tweets left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
         where id = ?
     `)
@@ -100,19 +101,30 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
     var hashtags string

     row := stmt.QueryRow(id)
-    err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyTo, &t.QuotedTweet, &mentions, &reply_mentions, &hashtags, &t.IsContentDownloaded)
+    err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyTo, &t.QuotedTweet, &mentions, &reply_mentions, &hashtags, &t.TombstoneType, &t.IsStub, &t.IsContentDownloaded)
     if err != nil {
         return t, err
     }
     t.PostedAt = time.Unix(int64(postedAt), 0) // args are `seconds` and `nanoseconds`

+    t.Mentions = []scraper.UserHandle{}
     for _, m := range strings.Split(mentions, ",") {
+        if m != "" {
             t.Mentions = append(t.Mentions, scraper.UserHandle(m))
+        }
     }
+    t.ReplyMentions = []scraper.UserHandle{}
     for _, m := range strings.Split(reply_mentions, ",") {
+        if m != "" {
             t.ReplyMentions = append(t.ReplyMentions, scraper.UserHandle(m))
+        }
     }
-    t.Hashtags = strings.Split(hashtags, ",")
+    t.Hashtags = []string{}
+    for _, h := range strings.Split(hashtags, ",") {
+        if h != "" {
+            t.Hashtags = append(t.Hashtags, h)
+        }
+    }

     imgs, err := p.GetImagesForTweet(t)
     if err != nil {
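The new `!= ""` guards in the loops above are needed because of a strings.Split corner case: splitting an empty string on a comma returns a one-element slice containing "", not an empty slice, so a tweet with no mentions would otherwise round-trip out of the database with a single empty mention. A quick standalone demonstration:

// split_empty.go — standalone demonstration of the strings.Split corner case.
package main

import (
    "fmt"
    "strings"
)

func main() {
    fmt.Printf("%q\n", strings.Split("", ","))    // [""] — one empty element, not an empty slice
    fmt.Printf("%q\n", strings.Split("a,b", ",")) // ["a" "b"]
}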

View File

@@ -34,6 +34,76 @@ func TestSaveAndLoadTweet(t *testing.T) {
     }
 }

+/**
+ * Same as above, but with a tombstone
+ */
+func TestSaveAndLoadTombstone(t *testing.T) {
+    profile_path := "test_profiles/TestTweetQueries"
+    profile := create_or_load_profile(profile_path)
+
+    tweet := create_dummy_tombstone()
+
+    // Save the tweet
+    err := profile.SaveTweet(tweet)
+    if err != nil {
+        t.Fatalf("Failed to save the tweet: %s", err.Error())
+    }
+
+    // Reload the tweet
+    new_tweet, err := profile.GetTweetById(tweet.ID)
+    if err != nil {
+        t.Fatalf("Failed to load the tweet: %s", err.Error())
+    }
+
+    if diff := deep.Equal(tweet, new_tweet); diff != nil {
+        t.Error(diff)
+    }
+}
+
+/**
+ * Saving a tweet that already exists shouldn't reduce its backed-up status.
+ * i.e., content which is already saved shouldn't be marked un-saved if it's removed from Twitter.
+ * After all, that's the whole point of archiving.
+ *
+ * - is_stub should only go from "yes" to "no"
+ * - is_content_downloaded should only go from "no" to "yes"
+ */
+func TestNoWorseningTweet(t *testing.T) {
+    profile_path := "test_profiles/TestTweetQueries"
+    profile := create_or_load_profile(profile_path)
+
+    tweet := create_dummy_tweet()
+    tweet.IsContentDownloaded = true
+    tweet.IsStub = false
+
+    // Save the tweet
+    err := profile.SaveTweet(tweet)
+    if err != nil {
+        t.Fatalf("Failed to save the tweet: %s", err.Error())
+    }
+
+    // Worsen the tweet and re-save it
+    tweet.IsContentDownloaded = false
+    tweet.IsStub = true
+    err = profile.SaveTweet(tweet)
+    if err != nil {
+        t.Fatalf("Failed to save the tweet: %s", err.Error())
+    }
+
+    // Reload the tweet
+    new_tweet, err := profile.GetTweetById(tweet.ID)
+    if err != nil {
+        t.Fatalf("Failed to load the tweet: %s", err.Error())
+    }
+
+    if new_tweet.IsStub != false {
+        t.Errorf("Should have preserved non-stub status")
+    }
+    if new_tweet.IsContentDownloaded != true {
+        t.Errorf("Should have preserved is-content-downloaded status")
+    }
+}
+
 /**
  * Should correctly report whether the User exists in the database
  */

View File

@@ -225,6 +225,24 @@ func create_dummy_tweet() scraper.Tweet {
     }
 }

+/**
+ * Create a random tombstone
+ */
+func create_dummy_tombstone() scraper.Tweet {
+    rand.Seed(time.Now().UnixNano())
+    tweet_id := scraper.TweetID(rand.Int())
+
+    return scraper.Tweet{
+        ID:            tweet_id,
+        UserID:        -1,
+        TombstoneType: "deleted",
+        IsStub:        true,
+        Mentions:      []scraper.UserHandle{},
+        ReplyMentions: []scraper.UserHandle{},
+        Hashtags:      []string{},
+    }
+}
+
 /**
  * Create a new retweet with a random ID for a given TweetID
  */

View File

@@ -32,6 +32,9 @@ type Tweet struct {
     Hashtags    []string
     QuotedTweet TweetID

+    TombstoneType string
+    IsStub        bool
+
     IsContentDownloaded bool
 }
@@ -81,10 +84,13 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
     ret.UserID = UserID(apiTweet.UserID)
     ret.Text = apiTweet.FullText

+    if apiTweet.TombstoneText == "" { // Skip time parsing for tombstones
         ret.PostedAt, err = time.Parse(time.RubyDate, apiTweet.CreatedAt)
         if err != nil {
             return
         }
+    }
+
     ret.NumLikes = apiTweet.FavoriteCount
     ret.NumRetweets = apiTweet.RetweetCount
     ret.NumReplies = apiTweet.ReplyCount
@@ -139,6 +145,10 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
         ret.Videos = []Video{new_video}
         ret.Images = []Image{}
     }
+
+    ret.TombstoneType = apiTweet.TombstoneText
+    ret.IsStub = !(ret.TombstoneType == "")
+
     return
 }
@@ -190,8 +200,20 @@ func GetTweetFull(id TweetID) (tweets []Tweet, retweets []Retweet, users []User,
             return
         }
     }
+
+    tombstone_users := tweet_response.HandleTombstones()
+    fmt.Printf("%v\n", tombstone_users)
+    for _, u := range tombstone_users {
+        fetched_user, err1 := GetUser(UserHandle(u))
+        if err1 != nil { // must check err1; the named return `err` is still nil here
+            err = err1
+            return
+        }
+        fmt.Println(fetched_user)
+        users = append(users, fetched_user)
+    }
-    return ParseTweetResponse(tweet_response)
+    tweets, retweets, _users, err := ParseTweetResponse(tweet_response)
+    users = append(users, _users...)
+    return
 }

 /**

View File

@@ -209,9 +209,40 @@ func TestParseTweetResponse(t *testing.T) {
         t.Errorf("Expected %d tweets, got %d", 29-3, len(tweets))
     }
     if len(retweets) != 3 {
-        t.Errorf("Expected %d tweets, got %d", 3, len(retweets))
+        t.Errorf("Expected %d retweets, got %d", 3, len(retweets))
     }
     if len(users) != 9 {
-        t.Errorf("Expected %d tweets, got %d", 9, len(users))
+        t.Errorf("Expected %d users, got %d", 9, len(users))
+    }
+}
+
+func TestParseTweetResponseWithTombstones(t *testing.T) {
+    data, err := ioutil.ReadFile("test_responses/tombstones/tombstone_deleted.json")
+    if err != nil {
+        panic(err)
+    }
+    var tweet_resp scraper.TweetResponse
+    err = json.Unmarshal(data, &tweet_resp)
+    if err != nil {
+        t.Errorf(err.Error())
+    }
+
+    extra_users := tweet_resp.HandleTombstones()
+    if len(extra_users) != 1 {
+        t.Errorf("Expected to need 1 extra user but got %d instead", len(extra_users))
+    }
+
+    tweets, retweets, users, err := scraper.ParseTweetResponse(tweet_resp)
+    if err != nil {
+        t.Fatal(err)
+    }
+    if len(tweets) != 2 {
+        t.Errorf("Expected %d tweets, got %d", 2, len(tweets))
+    }
+    if len(retweets) != 0 {
+        t.Errorf("Expected %d retweets, got %d", 0, len(retweets))
+    }
+    if len(users) != 1 {
+        t.Errorf("Expected %d users, got %d", 1, len(users))
     }
 }

View File

@@ -18,8 +18,7 @@ func GetUserFeedFor(user_id UserID, min_tweets int) (tweets []Tweet, retweets []
         return
     }

-    if len(tweet_response.GlobalObjects.Tweets) < min_tweets &&
-            tweet_response.GetCursor() != "" {
+    if len(tweet_response.GlobalObjects.Tweets) < min_tweets && tweet_response.GetCursor() != "" {
         err = api.GetMoreTweetsFromFeed(user_id, &tweet_response, min_tweets)
         if err != nil && err != END_OF_FEED {
             return