
- BUGFIX: deleted / banned users will no longer have all their other info wiped - e.g., follower counts, profile image URLs, bio, etc.
354 lines
12 KiB
Go
354 lines
12 KiB
Go
package persistence
|
|
|
|
import (
|
|
"database/sql"
|
|
"errors"
|
|
"fmt"
|
|
"path"
|
|
|
|
"github.com/jmoiron/sqlx"
|
|
"github.com/mattn/go-sqlite3"
|
|
|
|
"gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
|
|
)
|
|
|
|
type ErrConflictingUserHandle struct {
|
|
ConflictingUserID scraper.UserID
|
|
}
|
|
|
|
func (e ErrConflictingUserHandle) Error() string {
|
|
return fmt.Sprintf("active user with given handle already exists (id: %d)", e.ConflictingUserID)
|
|
}
|
|
|
|
// USERS_ALL_SQL_FIELDS is the comma-separated column list that the user queries in this file
// splice into their `select` statements, so that they all fetch the same set of columns.
const USERS_ALL_SQL_FIELDS = `
		id, display_name, handle, bio, following_count, followers_count, location, website, join_date, is_private, is_verified,
		is_banned, is_deleted, profile_image_url, profile_image_local_path, banner_image_url, banner_image_local_path,
		pinned_tweet_id, is_content_downloaded, is_followed`
|
|
|
|
// User save strategy:
|
|
//
|
|
// 1. Check if the user needs a fake ID; if so, assign one
|
|
// 2. Try to execute an update
|
|
// 2a. if the user is banned or deleted, don't overwrite other fields, blanking them
|
|
// 2b. if the user exists but `handle` conflicts with an active user, do conflict handling
|
|
// 3. If the user doesn't already exist, execute an insert. Do conflict handling if applicable
|
|
//
|
|
// Conflict handling:
|
|
//
|
|
// 1. Look up the ID of the user with conflicting handle
|
|
// 2. TODO: handle case where the previous user has a fake ID
|
|
// May have to rescrape that user's tweets to figure out if they're the same user or not.
|
|
// Strategy 1: assume they're the same users
|
|
// - Execute a full update on the old user, including their ID (we have a real ID for them now)
|
|
// - Update all the other tables (tweets, follows, lists, etc) with the new ID
|
|
// Strategy 2: assume they're different users
|
|
// - Mark the old user as deactivated and be done with it
|
|
// 3. Mark the old user as deactivated, eliminating the conflict
|
|
// 4. Re-save the new user
|
|
// 5. Return an ErrConflictingUserHandle, notifying the caller of the conflict
|
|
func (p Profile) SaveUser(u *scraper.User) error {
|
|
// First, check if the user needs a fake ID, and generate one if needed
|
|
if u.IsNeedingFakeID {
|
|
// User is fake; check if we already have them, in order to proceed
|
|
err := p.DB.QueryRow("select id from users_by_handle where lower(handle) = lower(?)", u.Handle).Scan(&u.ID)
|
|
if errors.Is(err, sql.ErrNoRows) {
|
|
// We need to continue-- create a new fake user
|
|
u.ID = p.NextFakeUserID()
|
|
} else if err == nil {
|
|
// We're done; a user exists with this handle already. No need to fake anything, and we have no new data
|
|
// to provide (since the ID is fake).
|
|
// ID has already been scanned into the User, for use by the caller.
|
|
return nil
|
|
} else {
|
|
// A real error occurred
|
|
panic(fmt.Errorf("Error checking for existence of fake user with handle %q:\n %w", u.Handle, err))
|
|
}
|
|
}
|
|
|
|
// Handler function to deal with UNIQUE constraint violations on `handle`.
|
|
//
|
|
// We know the UNIQUE violation must be on `handle`, because we checked for users with this ID
|
|
// above (`update` query).
|
|
handle_conflict := func() error {
|
|
var old_user scraper.User
|
|
err := p.DB.Get(&old_user,
|
|
`select id, is_id_fake from users where handle = ? and is_banned = 0 and is_deleted = 0`,
|
|
u.Handle,
|
|
)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
if old_user.IsIdFake {
|
|
panic("TODO: user with fake ID")
|
|
} else {
|
|
// 1. The being-saved user ID doesn't exist yet (or was previously inactive)
|
|
// 2. There's another user with the same handle who's currently considered active
|
|
// 3. Their ID is not fake.
|
|
// 4. The being-saved user is also understood to be active (otherwise a UNIQUE handle
|
|
// conflict wouldn't have occurred)
|
|
//
|
|
// Since we're saving an active user, the old user is presumably no longer active.
|
|
// They will need to be rescraped when posssible, to find out what's going on. For
|
|
// now, we will just mark them as deleted.
|
|
_, err := p.DB.Exec(`update users set is_deleted=1 where id = ?`, old_user.ID)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
// Now we can save our new user. Should succeed since the conflict is cleared:
|
|
err = p.SaveUser(u)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
// Notify caller of the duplicate for rescraping
|
|
return ErrConflictingUserHandle{ConflictingUserID: old_user.ID}
|
|
}
|
|
}
|
|
|
|
// Try to treat it like an `update` and see if it works
|
|
var result sql.Result
|
|
var err error
|
|
if u.IsBanned || u.IsDeleted {
|
|
// If user is banned or deleted, it's a stub, so don't update other fields
|
|
result, err = p.DB.NamedExec(`update users set is_deleted=:is_deleted, is_banned=:is_banned where id = :id`, u)
|
|
} else {
|
|
// This could be re-activating a previously deleted / banned user
|
|
result, err = p.DB.NamedExec(`
|
|
update users
|
|
set handle=:handle,
|
|
bio=:bio,
|
|
display_name=:display_name,
|
|
following_count=:following_count,
|
|
followers_count=:followers_count,
|
|
location=:location,
|
|
website=:website,
|
|
is_private=:is_private,
|
|
is_verified=:is_verified,
|
|
is_banned=:is_banned,
|
|
is_deleted=:is_deleted,
|
|
profile_image_url=:profile_image_url,
|
|
profile_image_local_path=:profile_image_local_path,
|
|
banner_image_url=:banner_image_url,
|
|
banner_image_local_path=:banner_image_local_path,
|
|
pinned_tweet_id=:pinned_tweet_id,
|
|
is_content_downloaded=(is_content_downloaded or :is_content_downloaded)
|
|
where id = :id
|
|
`, u)
|
|
}
|
|
if err != nil {
|
|
// Check for UNIQUE constraint violation on `handle` field
|
|
var sqliteErr sqlite3.Error
|
|
is_ok := errors.As(err, &sqliteErr)
|
|
if is_ok && sqliteErr.ExtendedCode == sqlite3.ErrConstraintUnique {
|
|
return handle_conflict()
|
|
} else {
|
|
// Unexpected error
|
|
return fmt.Errorf("Error executing SaveUser(%s):\n %w", u.Handle, err)
|
|
}
|
|
}
|
|
// If a row was updated, then the User already exists and was updated successfully; we're done
|
|
rows_affected, err := result.RowsAffected()
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
if rows_affected > 0 {
|
|
return nil
|
|
}
|
|
|
|
// It's a new user. Try to insert it:
|
|
_, err = p.DB.NamedExec(`
|
|
insert into users (id, display_name, handle, bio, following_count, followers_count, location, website, join_date, is_private,
|
|
is_verified, is_banned, is_deleted, profile_image_url, profile_image_local_path, banner_image_url,
|
|
banner_image_local_path, pinned_tweet_id, is_content_downloaded, is_id_fake)
|
|
values (:id, :display_name, :handle, :bio, :following_count, :followers_count, :location, :website, :join_date, :is_private,
|
|
:is_verified, :is_banned, :is_deleted, :profile_image_url, :profile_image_local_path, :banner_image_url,
|
|
:banner_image_local_path, :pinned_tweet_id, :is_content_downloaded, :is_id_fake)
|
|
`,
|
|
u,
|
|
)
|
|
if err == nil {
|
|
// It worked; user is inserted, we're done
|
|
return nil
|
|
}
|
|
|
|
// If execution reaches this point, then an error has occurred; err is not nil.
|
|
// Check if it's a UNIQUE CONSTRAINT FAILED:
|
|
var sqliteErr sqlite3.Error
|
|
is_ok := errors.As(err, &sqliteErr)
|
|
if is_ok && sqliteErr.ExtendedCode == sqlite3.ErrConstraintUnique { // Conflict detected
|
|
return handle_conflict()
|
|
} else {
|
|
// Some other error
|
|
return fmt.Errorf("Error executing SaveUser(%s):\n %w", u.Handle, err)
|
|
}
|
|
}
|
|
|
|
// Retrieve a User from the database, by handle.
|
|
//
|
|
// args:
|
|
// - handle: the user handle to search for
|
|
//
|
|
// returns:
|
|
// - the User, if it exists
|
|
func (p Profile) GetUserByHandle(handle scraper.UserHandle) (scraper.User, error) {
|
|
db := p.DB
|
|
|
|
var ret scraper.User
|
|
err := db.Get(&ret, `
|
|
select `+USERS_ALL_SQL_FIELDS+`
|
|
from users_by_handle
|
|
where lower(handle) = lower(?)
|
|
`, handle)
|
|
|
|
if errors.Is(err, sql.ErrNoRows) {
|
|
return ret, ErrNotInDatabase
|
|
}
|
|
return ret, nil
|
|
}
|
|
|
|
// Retrieve a User from the database, by user ID.
|
|
//
|
|
// args:
|
|
// - id: the user ID to search for
|
|
//
|
|
// returns:
|
|
// - the User, if it exists
|
|
func (p Profile) GetUserByID(id scraper.UserID) (scraper.User, error) {
|
|
db := p.DB
|
|
|
|
var ret scraper.User
|
|
|
|
err := db.Get(&ret, `
|
|
select `+USERS_ALL_SQL_FIELDS+`
|
|
from users
|
|
where id = ?
|
|
`, id)
|
|
if errors.Is(err, sql.ErrNoRows) {
|
|
return ret, ErrNotInDatabase
|
|
}
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
return ret, nil
|
|
}
|
|
|
|
// Returns `true` if content download is needed, `false` otherwise
|
|
//
|
|
// If the user is banned, returns false because downloading will be impossible.
|
|
//
|
|
// If:
|
|
// - the user isn't in the DB at all (first time scraping), OR
|
|
// - `is_content_downloaded` is false in the DB, OR
|
|
// - the banner / profile image URL has changed from what the DB has
|
|
// then it needs to be downloaded.
|
|
//
|
|
// The `user` object will always have `is_content_downloaded` = false on every scrape. This is
|
|
// why the No Worsening Principle is needed.
|
|
func (p Profile) CheckUserContentDownloadNeeded(user scraper.User) bool {
|
|
row := p.DB.QueryRow(`select is_content_downloaded, profile_image_url, banner_image_url from users where id = ?`, user.ID)
|
|
|
|
var is_content_downloaded bool
|
|
var profile_image_url string
|
|
var banner_image_url string
|
|
err := row.Scan(&is_content_downloaded, &profile_image_url, &banner_image_url)
|
|
if err != nil {
|
|
if errors.Is(err, sql.ErrNoRows) {
|
|
return true
|
|
} else {
|
|
panic(err)
|
|
}
|
|
}
|
|
|
|
if !is_content_downloaded {
|
|
return true
|
|
}
|
|
|
|
banner_path := p.get_banner_image_output_path(user)
|
|
if banner_path != "" && !file_exists(banner_path) {
|
|
return true
|
|
}
|
|
profile_path := p.get_profile_image_output_path(user)
|
|
return !file_exists(profile_path)
|
|
}
|
|
|
|
// Follow / unfollow a user. Update the given User object's IsFollowed field.
|
|
func (p Profile) SetUserFollowed(user *scraper.User, is_followed bool) {
|
|
result, err := p.DB.Exec("update users set is_followed = ? where id = ?", is_followed, user.ID)
|
|
if err != nil {
|
|
panic(fmt.Errorf("Error inserting user with handle %q:\n %w", user.Handle, err))
|
|
}
|
|
count, err := result.RowsAffected()
|
|
if err != nil {
|
|
panic(fmt.Errorf("Unknown error retrieving row count:\n %w", err))
|
|
}
|
|
if count != 1 {
|
|
panic(fmt.Errorf("User with handle %q not found", user.Handle))
|
|
}
|
|
user.IsFollowed = is_followed
|
|
}
|
|
|
|
func (p Profile) NextFakeUserID() scraper.UserID {
|
|
_, err := p.DB.Exec("update fake_user_sequence set latest_fake_id = latest_fake_id + 1")
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
var ret scraper.UserID
|
|
err = p.DB.QueryRow("select latest_fake_id from fake_user_sequence").Scan(&ret)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
return ret
|
|
}
|
|
|
|
// TODO: This is only used in checking whether the media downloader should get the big or small version of
|
|
// a profile image. That should be rewritten
|
|
func (p Profile) IsFollowing(user scraper.User) bool {
|
|
row := p.DB.QueryRow("select is_followed from users where id like ?", user.ID)
|
|
var ret bool
|
|
err := row.Scan(&ret)
|
|
if err != nil {
|
|
if errors.Is(err, sql.ErrNoRows) {
|
|
return false
|
|
}
|
|
panic(err) // A real error
|
|
}
|
|
return ret
|
|
}
|
|
|
|
// Utility function to compute the path to save banner image to
|
|
func (p Profile) get_banner_image_output_path(u scraper.User) string {
|
|
return path.Join(p.ProfileDir, "profile_images", u.BannerImageLocalPath)
|
|
}
|
|
|
|
// Utility function to compute the path to save profile image to
|
|
func (p Profile) get_profile_image_output_path(u scraper.User) string {
|
|
if u.ProfileImageUrl == "" {
|
|
return path.Join(p.ProfileDir, "profile_images", path.Base(scraper.DEFAULT_PROFILE_IMAGE_URL))
|
|
}
|
|
return path.Join(p.ProfileDir, "profile_images", u.ProfileImageLocalPath)
|
|
}
|
|
|
|
// Do a text search for users
|
|
func (p Profile) SearchUsers(s string) []scraper.User {
|
|
var ret []scraper.User
|
|
q, args, err := sqlx.Named(`
|
|
select `+USERS_ALL_SQL_FIELDS+`
|
|
from users
|
|
where handle like :val
|
|
or display_name like :val
|
|
order by handle like :val or display_name like :val desc,
|
|
followers_count desc
|
|
`,
|
|
struct {
|
|
Val string `db:"val"`
|
|
}{fmt.Sprintf("%%%s%%", s)},
|
|
)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
err = p.DB.Select(&ret, q, args...)
|
|
if err != nil && !errors.Is(err, sql.ErrNoRows) {
|
|
panic(err)
|
|
}
|
|
return ret
|
|
}
|