From 79033cfc79e2d23818a89e114c32b2bc373dc4e0 Mon Sep 17 00:00:00 2001 From: Alessio Date: Tue, 17 Sep 2024 18:30:11 -0700 Subject: [PATCH] Make user handles only unique among active users --- cmd/tests.sh | 2 +- pkg/persistence/compound_ssf_queries.go | 12 ++--- pkg/persistence/schema.sql | 25 ++++++++- pkg/persistence/user_queries.go | 5 +- pkg/persistence/versions.go | 70 ++++++++++++++++++++++++- 5 files changed, 103 insertions(+), 11 deletions(-) diff --git a/cmd/tests.sh b/cmd/tests.sh index 59aea0c..a1c105f 100755 --- a/cmd/tests.sh +++ b/cmd/tests.sh @@ -360,7 +360,7 @@ tw fetch_tweet_only https://twitter.com/PandasAndVidya/status/156271472796842803 test "$(sqlite3 twitter.db "select count(*) from tweets where id = 156271472796842803")" == "0" # Test that you can pass a session with the `.session` file extension too -tw --session Offline_Twatter.session list_followed > /dev/null # Dummy operation +tw --session Offline_Twatter.session fetch_user Offline_Twatter # Dummy operation # Test search diff --git a/pkg/persistence/compound_ssf_queries.go b/pkg/persistence/compound_ssf_queries.go index 3207e78..b84d4ca 100644 --- a/pkg/persistence/compound_ssf_queries.go +++ b/pkg/persistence/compound_ssf_queries.go @@ -425,7 +425,7 @@ func (p Profile) NextPage(c Cursor, current_user_id scraper.UserID) (Feed, error // From, to, by, and RT'd by user handles if c.FromUserHandle != "" { - where_clauses = append(where_clauses, "tweets.user_id = (select id from users where handle like ?)") + where_clauses = append(where_clauses, "tweets.user_id = (select id from users_by_handle where handle like ?)") bind_values = append(bind_values, c.FromUserHandle) } for _, to_user := range c.ToUserHandles { @@ -433,11 +433,11 @@ func (p Profile) NextPage(c Cursor, current_user_id scraper.UserID) (Feed, error bind_values = append(bind_values, fmt.Sprintf("%%%s%%", to_user)) } if c.RetweetedByUserHandle != "" { - where_clauses = append(where_clauses, "retweeted_by = (select id from users where handle like ?)") + where_clauses = append(where_clauses, "retweeted_by = (select id from users_by_handle where handle like ?)") bind_values = append(bind_values, c.RetweetedByUserHandle) } if c.ByUserHandle != "" { - where_clauses = append(where_clauses, "by_user_id = (select id from users where handle like ?)") + where_clauses = append(where_clauses, "by_user_id = (select id from users_by_handle where handle like ?)") bind_values = append(bind_values, c.ByUserHandle) } if c.ListID != 0 { @@ -446,7 +446,7 @@ func (p Profile) NextPage(c Cursor, current_user_id scraper.UserID) (Feed, error } if c.FollowedByUserHandle != "" { where_clauses = append(where_clauses, - "by_user_id in (select followee_id from follows where follower_id = (select id from users where handle like ?))") + "by_user_id in (select followee_id from follows where follower_id = (select id from users_by_handle where handle like ?))") bind_values = append(bind_values, c.FollowedByUserHandle) } @@ -532,7 +532,7 @@ func (p Profile) NextPage(c Cursor, current_user_id scraper.UserID) (Feed, error likes_sort_order_field := "" if c.LikedByUserHandle != "" { liked_by_filter_join_clause = " join likes filter_likes on tweets.id = filter_likes.tweet_id " - where_clauses = append(where_clauses, "filter_likes.user_id = (select id from users where handle like ?) ") + where_clauses = append(where_clauses, "filter_likes.user_id = (select id from users_by_handle where handle like ?) ") bind_values = append(bind_values, c.LikedByUserHandle) likes_sort_order_field = ", coalesce(filter_likes.sort_order, -1) likes_sort_order " @@ -545,7 +545,7 @@ func (p Profile) NextPage(c Cursor, current_user_id scraper.UserID) (Feed, error bookmarks_sort_order_field := "" if c.BookmarkedByUserHandle != "" { bookmarked_by_filter_join_clause = " join bookmarks filter_bookmarks on tweets.id = filter_bookmarks.tweet_id " - where_clauses = append(where_clauses, "filter_bookmarks.user_id = (select id from users where handle like ?) ") + where_clauses = append(where_clauses, "filter_bookmarks.user_id = (select id from users_by_handle where handle like ?) ") bind_values = append(bind_values, c.BookmarkedByUserHandle) bookmarks_sort_order_field = ", coalesce(filter_bookmarks.sort_order, -1) bookmarks_sort_order " diff --git a/pkg/persistence/schema.sql b/pkg/persistence/schema.sql index cb21e19..5531f58 100644 --- a/pkg/persistence/schema.sql +++ b/pkg/persistence/schema.sql @@ -7,7 +7,7 @@ PRAGMA foreign_keys = on; create table users (rowid integer primary key, id integer unique not null check(typeof(id) = 'integer'), display_name text not null, - handle text unique not null, + handle text not null, bio text, following_count integer, followers_count integer, @@ -28,6 +28,29 @@ create table users (rowid integer primary key, is_id_fake boolean default 0, is_content_downloaded boolean default 0 ); +create unique index index_active_users_handle_unique on users (handle) where is_banned = 0 and is_deleted = 0; + +create view users_by_handle as + with active_users as ( + select * from users where is_banned = 0 and is_deleted = 0 + ), inactive_users as ( + select * from users where is_banned = 1 or is_deleted = 1 + ), inactive_users_with_no_shadowing_active_user as ( + select * from inactive_users where not exists ( + -- Ensure no active user exists for this handle + select 1 from active_users where active_users.handle = inactive_users.handle + ) + ) + select * from users + where is_banned = 0 and is_deleted = 0 -- select active users directly + or users.id = ( + -- select the inactive user with the highest ID if no active user exists for the handle + select max(id) + from inactive_users_with_no_shadowing_active_user + where users.handle = inactive_users_with_no_shadowing_active_user.handle + ) + group by handle having id = max(id); + create table fake_user_sequence(latest_fake_id integer not null); insert into fake_user_sequence values(0x4000000000000000); diff --git a/pkg/persistence/user_queries.go b/pkg/persistence/user_queries.go index d2a50a3..85450e5 100644 --- a/pkg/persistence/user_queries.go +++ b/pkg/persistence/user_queries.go @@ -23,7 +23,8 @@ const USERS_ALL_SQL_FIELDS = ` // - u: the User func (p Profile) SaveUser(u *scraper.User) error { if u.IsNeedingFakeID { - err := p.DB.QueryRow("select id from users where lower(handle) = lower(?)", u.Handle).Scan(&u.ID) + // User is fake; check if we already have them, in order to proceed + err := p.DB.QueryRow("select id from users_by_handle where lower(handle) = lower(?)", u.Handle).Scan(&u.ID) if errors.Is(err, sql.ErrNoRows) { // We need to continue-- create a new fake user u.ID = p.NextFakeUserID() @@ -86,7 +87,7 @@ func (p Profile) GetUserByHandle(handle scraper.UserHandle) (scraper.User, error var ret scraper.User err := db.Get(&ret, ` select `+USERS_ALL_SQL_FIELDS+` - from users + from users_by_handle where lower(handle) = lower(?) `, handle) diff --git a/pkg/persistence/versions.go b/pkg/persistence/versions.go index 1620f68..6891079 100644 --- a/pkg/persistence/versions.go +++ b/pkg/persistence/versions.go @@ -21,8 +21,9 @@ Please upgrade this application to a newer version to use this profile. Or down ) } -// The Nth entry is the migration that moves you from version N to version N+1. +// Database starts at version 0. First migration brings us to version 1 var MIGRATIONS = []string{ + // Version 1 `create table polls (rowid integer primary key, id integer unique not null check(typeof(id) = 'integer'), tweet_id integer not null, @@ -297,6 +298,7 @@ var MIGRATIONS = []string{ foreign key(chat_message_id) references chat_messages(id) ); create index if not exists index_chat_message_urls_chat_message_id on chat_message_urls (chat_message_id);`, + // 30 `create table bookmarks(rowid integer primary key, sort_order integer not null, -- Can't be unique because "-1" is used as "unknown" value user_id integer not null, @@ -354,6 +356,72 @@ var MIGRATIONS = []string{ user_id not null references users(id), unique(notification_id, user_id) );`, + `pragma foreign_keys = OFF; + begin exclusive transaction; + create table users_new (rowid integer primary key, + id integer unique not null check(typeof(id) = 'integer'), + display_name text not null, + handle text not null, + bio text, + following_count integer, + followers_count integer, + location text, + website text, + join_date integer, + is_private boolean default 0, + is_verified boolean default 0, + is_banned boolean default 0, + is_deleted boolean default 0, + profile_image_url text, + profile_image_local_path text, + banner_image_url text, + banner_image_local_path text, + pinned_tweet_id integer check(typeof(pinned_tweet_id) = 'integer' or pinned_tweet_id = ''), + + is_followed boolean default 0, + is_id_fake boolean default 0, + is_content_downloaded boolean default 0 + ); + create unique index index_active_users_handle_unique on users_new (handle) where is_banned = 0 and is_deleted = 0; + + INSERT INTO users_new (rowid, id, display_name, handle, bio, following_count, followers_count, location, + website, join_date, is_private, is_verified, is_banned, is_deleted, profile_image_url, + profile_image_local_path, banner_image_url, banner_image_local_path, pinned_tweet_id, is_followed, + is_id_fake, is_content_downloaded) + SELECT rowid, id, display_name, handle, bio, following_count, followers_count, location, + website, join_date, is_private, is_verified, is_banned, is_deleted, profile_image_url, + profile_image_local_path, banner_image_url, banner_image_local_path, pinned_tweet_id, is_followed, + is_id_fake, is_content_downloaded + FROM users; + + drop table users; + alter table users_new rename to users; + + create view users_by_handle as + with active_users as ( + select * from users where is_banned = 0 and is_deleted = 0 + ), inactive_users as ( + select * from users where is_banned = 1 or is_deleted = 1 + ), inactive_users_with_no_shadowing_active_user as ( + select * from inactive_users where not exists ( + -- Ensure no active user exists for this handle + select 1 from active_users where active_users.handle = inactive_users.handle + ) + ) + select * from users + where is_banned = 0 and is_deleted = 0 -- select active users directly + or users.id = ( + -- select the inactive user with the highest ID if no active user exists for the handle + select max(id) + from inactive_users_with_no_shadowing_active_user + where users.handle = inactive_users_with_no_shadowing_active_user.handle + ) + group by handle having id = max(id); + + commit; + pragma foreign_keys = ON; + vacuum; + `, } var ENGINE_DATABASE_VERSION = len(MIGRATIONS)