REFACTOR: SaveTweetTrove no longer calls scraper.GetUserByID, removing another scraper dependency from persistence package

- it was calling it to handle conflicting user handles
- now user handle conflicts are handled by the caller (SaveTweetTrove returns the conflicting user IDs)
- Add a utility function in `main` and `webserver` packages to rescrape the conflicting users
This commit is contained in:
Alessio 2025-02-10 12:53:19 -08:00
parent fc66d1d8a6
commit 4132eb4bca
11 changed files with 96 additions and 63 deletions

View File

@ -96,3 +96,30 @@ func is_scrape_failure(err error) bool {
} }
return true return true
} }
// DUPE: full_save_tweet_trove
// Saves a TweetTrove (downloading media inline), then resolves any user-handle
// conflicts reported by the persistence layer: each conflicting user is
// rescraped and re-saved, or tombstoned if their account no longer exists.
// Panics on any unrecoverable scrape/save error.
func full_save_tweet_trove(trove scraper.TweetTrove) {
	conflicting_ids := profile.SaveTweetTrove(trove, true, api.DownloadMedia)
	for _, user_id := range conflicting_ids {
		fmt.Printf(
			terminal_utils.COLOR_YELLOW+
				"Conflicting user handle found (ID %d); old user has been marked deleted. Rescraping manually"+
				terminal_utils.COLOR_RESET+"\n",
			user_id,
		)
		// Fetch a fresh copy of the conflicting user
		fresh_user, err := scraper.GetUserByID(user_id)
		switch {
		case errors.Is(err, scraper.ErrDoesntExist):
			// Account is gone: store a placeholder marked deleted.
			// Handle and display name won't be updated if the user exists.
			fresh_user = scraper.User{ID: user_id, DisplayName: "<Unknown User>", Handle: "<UNKNOWN USER>", IsDeleted: true}
		case err != nil:
			panic(fmt.Errorf("error scraping conflicting user (ID %d): %w", user_id, err))
		}
		if err := profile.SaveUser(&fresh_user); err != nil {
			panic(fmt.Errorf(
				"error saving rescraped conflicting user with ID %d and handle %q: %w",
				fresh_user.ID, fresh_user.Handle, err,
			))
		}
	}
}

View File

@ -410,7 +410,7 @@ func fetch_tweet_conversation(tweet_identifier string, how_many int) {
if is_scrape_failure(err) { if is_scrape_failure(err) {
die(err.Error(), false, -1) die(err.Error(), false, -1)
} }
profile.SaveTweetTrove(trove, true, api.DownloadMedia) full_save_tweet_trove(trove)
happy_exit(fmt.Sprintf("Saved %d tweets and %d users", len(trove.Tweets), len(trove.Users)), err) happy_exit(fmt.Sprintf("Saved %d tweets and %d users", len(trove.Tweets), len(trove.Users)), err)
} }
@ -431,7 +431,7 @@ func fetch_user_feed(handle string, how_many int) {
if is_scrape_failure(err) { if is_scrape_failure(err) {
die(fmt.Sprintf("Error scraping feed: %s\n %s", handle, err.Error()), false, -2) die(fmt.Sprintf("Error scraping feed: %s\n %s", handle, err.Error()), false, -2)
} }
profile.SaveTweetTrove(trove, true, api.DownloadMedia) full_save_tweet_trove(trove)
happy_exit( happy_exit(
fmt.Sprintf("Saved %d tweets, %d retweets and %d users", len(trove.Tweets), len(trove.Retweets), len(trove.Users)), fmt.Sprintf("Saved %d tweets, %d retweets and %d users", len(trove.Tweets), len(trove.Retweets), len(trove.Users)),
@ -449,7 +449,7 @@ func get_user_likes(handle string, how_many int) {
if is_scrape_failure(err) { if is_scrape_failure(err) {
die(fmt.Sprintf("Error scraping feed: %s\n %s", handle, err.Error()), false, -2) die(fmt.Sprintf("Error scraping feed: %s\n %s", handle, err.Error()), false, -2)
} }
profile.SaveTweetTrove(trove, true, api.DownloadMedia) full_save_tweet_trove(trove)
happy_exit( happy_exit(
fmt.Sprintf("Saved %d tweets, %d retweets and %d users", len(trove.Tweets), len(trove.Retweets), len(trove.Users)), fmt.Sprintf("Saved %d tweets, %d retweets and %d users", len(trove.Tweets), len(trove.Retweets), len(trove.Users)),
@ -467,7 +467,7 @@ func get_followees(handle string, how_many int) {
if is_scrape_failure(err) { if is_scrape_failure(err) {
die(fmt.Sprintf("Error getting followees: %s\n %s", handle, err.Error()), false, -2) die(fmt.Sprintf("Error getting followees: %s\n %s", handle, err.Error()), false, -2)
} }
profile.SaveTweetTrove(trove, true, api.DownloadMedia) full_save_tweet_trove(trove)
profile.SaveAsFolloweesList(user.ID, trove) profile.SaveAsFolloweesList(user.ID, trove)
happy_exit(fmt.Sprintf("Saved %d followees", len(trove.Users)), err) happy_exit(fmt.Sprintf("Saved %d followees", len(trove.Users)), err)
@ -481,7 +481,7 @@ func get_followers(handle string, how_many int) {
if is_scrape_failure(err) { if is_scrape_failure(err) {
die(fmt.Sprintf("Error getting followees: %s\n %s", handle, err.Error()), false, -2) die(fmt.Sprintf("Error getting followees: %s\n %s", handle, err.Error()), false, -2)
} }
profile.SaveTweetTrove(trove, true, api.DownloadMedia) full_save_tweet_trove(trove)
profile.SaveAsFollowersList(user.ID, trove) profile.SaveAsFollowersList(user.ID, trove)
happy_exit(fmt.Sprintf("Saved %d followers", len(trove.Users)), err) happy_exit(fmt.Sprintf("Saved %d followers", len(trove.Users)), err)
@ -491,7 +491,7 @@ func get_bookmarks(how_many int) {
if is_scrape_failure(err) { if is_scrape_failure(err) {
die(fmt.Sprintf("Error scraping bookmarks:\n %s", err.Error()), false, -2) die(fmt.Sprintf("Error scraping bookmarks:\n %s", err.Error()), false, -2)
} }
profile.SaveTweetTrove(trove, true, api.DownloadMedia) full_save_tweet_trove(trove)
happy_exit(fmt.Sprintf( happy_exit(fmt.Sprintf(
"Saved %d tweets, %d retweets, %d users, and %d bookmarks", "Saved %d tweets, %d retweets, %d users, and %d bookmarks",
@ -504,7 +504,7 @@ func fetch_timeline(is_following_only bool) {
if is_scrape_failure(err) { if is_scrape_failure(err) {
die(fmt.Sprintf("Error fetching timeline:\n %s", err.Error()), false, -2) die(fmt.Sprintf("Error fetching timeline:\n %s", err.Error()), false, -2)
} }
profile.SaveTweetTrove(trove, true, api.DownloadMedia) full_save_tweet_trove(trove)
happy_exit( happy_exit(
fmt.Sprintf("Saved %d tweets, %d retweets and %d users", len(trove.Tweets), len(trove.Retweets), len(trove.Users)), fmt.Sprintf("Saved %d tweets, %d retweets and %d users", len(trove.Tweets), len(trove.Retweets), len(trove.Users)),
@ -544,7 +544,7 @@ func search(query string, how_many int) {
if is_scrape_failure(err) { if is_scrape_failure(err) {
die(fmt.Sprintf("Error scraping search results: %s", err.Error()), false, -100) die(fmt.Sprintf("Error scraping search results: %s", err.Error()), false, -100)
} }
profile.SaveTweetTrove(trove, true, api.DownloadMedia) full_save_tweet_trove(trove)
happy_exit(fmt.Sprintf("Saved %d tweets and %d users", len(trove.Tweets), len(trove.Users)), err) happy_exit(fmt.Sprintf("Saved %d tweets and %d users", len(trove.Tweets), len(trove.Users)), err)
} }
@ -607,7 +607,7 @@ func fetch_inbox(how_many int) {
if err != nil { if err != nil {
die(fmt.Sprintf("Failed to fetch inbox:\n %s", err.Error()), false, 1) die(fmt.Sprintf("Failed to fetch inbox:\n %s", err.Error()), false, 1)
} }
profile.SaveTweetTrove(trove, true, api.DownloadMedia) full_save_tweet_trove(trove)
happy_exit(fmt.Sprintf("Saved %d messages from %d chats", len(trove.Messages), len(trove.Rooms)), nil) happy_exit(fmt.Sprintf("Saved %d messages from %d chats", len(trove.Messages), len(trove.Rooms)), nil)
} }
@ -621,7 +621,7 @@ func fetch_dm(id string, how_many int) {
if err != nil { if err != nil {
die(fmt.Sprintf("Failed to fetch dm:\n %s", err.Error()), false, 1) die(fmt.Sprintf("Failed to fetch dm:\n %s", err.Error()), false, 1)
} }
profile.SaveTweetTrove(trove, true, api.DownloadMedia) full_save_tweet_trove(trove)
happy_exit( happy_exit(
fmt.Sprintf("Saved %d messages from %d chats", len(trove.Messages), len(trove.Rooms)), fmt.Sprintf("Saved %d messages from %d chats", len(trove.Messages), len(trove.Rooms)),
err, err,
@ -638,7 +638,7 @@ func send_dm(room_id string, text string, in_reply_to_id int) {
if err != nil { if err != nil {
die(fmt.Sprintf("Failed to send dm:\n %s", err.Error()), false, 1) die(fmt.Sprintf("Failed to send dm:\n %s", err.Error()), false, 1)
} }
profile.SaveTweetTrove(trove, true, api.DownloadMedia) full_save_tweet_trove(trove)
happy_exit(fmt.Sprintf("Saved %d messages from %d chats", len(trove.Messages), len(trove.Rooms)), nil) happy_exit(fmt.Sprintf("Saved %d messages from %d chats", len(trove.Messages), len(trove.Rooms)), nil)
} }
@ -670,7 +670,7 @@ func get_notifications(how_many int) {
panic(err) panic(err)
} }
profile.SaveTweetTrove(trove, true, api.DownloadMedia) full_save_tweet_trove(trove)
happy_exit(fmt.Sprintf("Saved %d notifications, %d tweets and %d users", happy_exit(fmt.Sprintf("Saved %d notifications, %d tweets and %d users",
len(trove.Notifications), len(trove.Tweets), len(trove.Users), len(trove.Notifications), len(trove.Tweets), len(trove.Users),
), nil) ), nil)

View File

@ -26,8 +26,7 @@ func (app *Application) Bookmarks(w http.ResponseWriter, r *http.Request) {
panic(err) // Return a toast panic(err) // Return a toast
} }
app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) app.full_save_tweet_trove(trove)
go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia)
} }
c := persistence.NewUserFeedBookmarksCursor(app.ActiveUser.Handle) c := persistence.NewUserFeedBookmarksCursor(app.ActiveUser.Handle)

View File

@ -89,8 +89,7 @@ func (app *Application) after_login(w http.ResponseWriter, r *http.Request, api
http.Redirect(w, r, "/", 303) http.Redirect(w, r, "/", 303)
} }
fmt.Println("Saving initial feed results...") fmt.Println("Saving initial feed results...")
app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) app.full_save_tweet_trove(trove)
go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia)
// Scrape the user's followers // Scrape the user's followers
trove, err = app.API.GetFollowees(user.ID, 1000) trove, err = app.API.GetFollowees(user.ID, 1000)
@ -98,9 +97,8 @@ func (app *Application) after_login(w http.ResponseWriter, r *http.Request, api
app.ErrorLog.Printf("Failed to scrape followers: %s", err.Error()) app.ErrorLog.Printf("Failed to scrape followers: %s", err.Error())
http.Redirect(w, r, "/", 303) http.Redirect(w, r, "/", 303)
} }
app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) app.full_save_tweet_trove(trove)
app.Profile.SaveAsFolloweesList(user.ID, trove) app.Profile.SaveAsFolloweesList(user.ID, trove)
go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia)
// Redirect to Timeline // Redirect to Timeline
http.Redirect(w, r, "/", 303) http.Redirect(w, r, "/", 303)
@ -129,8 +127,7 @@ func (app *Application) ChangeSession(w http.ResponseWriter, r *http.Request) {
return return
} }
// We have to save the notifications first, otherwise it'll just report 0 since the last-read sort index // We have to save the notifications first, otherwise it'll just report 0 since the last-read sort index
app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) app.full_save_tweet_trove(trove)
go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia)
// Set the notifications count // Set the notifications count
app.LastReadNotificationSortIndex = last_unread_notification_sort_index app.LastReadNotificationSortIndex = last_unread_notification_sort_index
}() }()

View File

@ -80,8 +80,7 @@ func (app *Application) message_send(w http.ResponseWriter, r *http.Request) {
if err != nil { if err != nil {
panic(err) panic(err)
} }
app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) app.full_save_tweet_trove(trove)
go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia)
} }
func (app *Application) message_detail(w http.ResponseWriter, r *http.Request) { func (app *Application) message_detail(w http.ResponseWriter, r *http.Request) {
@ -153,8 +152,7 @@ func (app *Application) message_detail(w http.ResponseWriter, r *http.Request) {
if err != nil { if err != nil {
panic(err) panic(err)
} }
app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) app.full_save_tweet_trove(trove)
go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) // Download the content in the background
} }
// `LatestPollingTimestamp` sort of passes-through the function; if we're not updating it, it // `LatestPollingTimestamp` sort of passes-through the function; if we're not updating it, it
@ -244,8 +242,7 @@ func (app *Application) Messages(w http.ResponseWriter, r *http.Request) {
panic(err) panic(err)
} }
inbox_cursor = new_cursor inbox_cursor = new_cursor
app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) app.full_save_tweet_trove(trove)
go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia)
} }
parts := strings.Split(strings.Trim(r.URL.Path, "/"), "/") parts := strings.Split(strings.Trim(r.URL.Path, "/"), "/")

View File

@ -107,8 +107,7 @@ func (app *Application) Search(w http.ResponseWriter, r *http.Request) {
app.ErrorLog.Print(err) app.ErrorLog.Print(err)
// TOOD: show error in UI // TOOD: show error in UI
} }
app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) app.full_save_tweet_trove(trove)
go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia)
} }
c, err := persistence.NewCursorFromSearchQuery(search_text) c, err := persistence.NewCursorFromSearchQuery(search_text)

View File

@ -53,8 +53,7 @@ func (app *Application) ensure_tweet(id scraper.TweetID, is_forced bool, is_conv
// Save the trove unless there was an unrecoverable error // Save the trove unless there was an unrecoverable error
if err == nil || errors.Is(err, scraper.END_OF_FEED) || errors.Is(err, scraper.ErrRateLimited) { if err == nil || errors.Is(err, scraper.END_OF_FEED) || errors.Is(err, scraper.ErrRateLimited) {
app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) app.full_save_tweet_trove(trove)
go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) // Download the content in the background
_, is_available = trove.Tweets[id] _, is_available = trove.Tweets[id]
} }

View File

@ -59,16 +59,14 @@ func (app *Application) UserFeed(w http.ResponseWriter, r *http.Request) {
app.ErrorLog.Print(err) app.ErrorLog.Print(err)
// TOOD: show error in UI // TOOD: show error in UI
} }
app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) app.full_save_tweet_trove(trove)
go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia)
} else if len(parts) == 2 && parts[1] == "likes" { } else if len(parts) == 2 && parts[1] == "likes" {
trove, err := app.API.GetUserLikes(user.ID, 50) // TODO: parameterizable trove, err := app.API.GetUserLikes(user.ID, 50) // TODO: parameterizable
if err != nil { if err != nil {
app.ErrorLog.Print(err) app.ErrorLog.Print(err)
// TOOD: show error in UI // TOOD: show error in UI
} }
app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) app.full_save_tweet_trove(trove)
go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia)
} }
} }
@ -171,9 +169,8 @@ func (app *Application) UserFollowees(w http.ResponseWriter, r *http.Request, us
app.ErrorLog.Print(err) app.ErrorLog.Print(err)
// TOOD: show error in UI // TOOD: show error in UI
} }
app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) app.full_save_tweet_trove(trove)
app.Profile.SaveAsFolloweesList(user.ID, trove) app.Profile.SaveAsFolloweesList(user.ID, trove)
go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia)
} }
data, trove := NewFollowsData(app.Profile.GetFollowees(user.ID)) data, trove := NewFollowsData(app.Profile.GetFollowees(user.ID))
@ -197,9 +194,8 @@ func (app *Application) UserFollowers(w http.ResponseWriter, r *http.Request, us
app.ErrorLog.Print(err) app.ErrorLog.Print(err)
// TOOD: show error in UI // TOOD: show error in UI
} }
app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) app.full_save_tweet_trove(trove)
app.Profile.SaveAsFollowersList(user.ID, trove) app.Profile.SaveAsFollowersList(user.ID, trove)
go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia)
} }
data, trove := NewFollowsData(app.Profile.GetFollowers(user.ID)) data, trove := NewFollowsData(app.Profile.GetFollowers(user.ID))

View File

@ -49,8 +49,7 @@ func (t *BackgroundTask) Do() {
// Run the task // Run the task
trove := t.GetTroveFunc(&t.app.API) trove := t.GetTroveFunc(&t.app.API)
t.log.Print("saving results") t.log.Print("saving results")
t.app.Profile.SaveTweetTrove(trove, false, t.app.API.DownloadMedia) t.app.full_save_tweet_trove(trove)
go t.app.Profile.SaveTweetTrove(trove, true, t.app.API.DownloadMedia)
t.log.Print("success") t.log.Print("success")
} }

View File

@ -0,0 +1,38 @@
package webserver
import (
"errors"
"fmt"
. "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
)
// DUPE: full_save_tweet_trove
// Saves a TweetTrove synchronously (without media), resolves any user-handle
// conflicts by rescraping the affected users (or tombstoning deleted accounts),
// then kicks off the media download pass in a background goroutine.
// Panics on any unrecoverable scrape/save error.
func (app *Application) full_save_tweet_trove(trove TweetTrove) {
	// Save the initial trove and collect any user-handle conflicts
	for _, user_id := range app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) {
		app.InfoLog.Printf("Conflicting user handle found (ID %d); old user has been marked deleted. Rescraping manually", user_id)
		// Fetch a fresh copy of the conflicting user
		fresh_user, err := GetUserByID(user_id)
		switch {
		case errors.Is(err, ErrDoesntExist):
			// Account is gone: store a placeholder marked deleted.
			// Handle and display name won't be updated if the user exists.
			fresh_user = User{ID: user_id, DisplayName: "<Unknown User>", Handle: "<UNKNOWN USER>", IsDeleted: true}
		case err != nil:
			panic(fmt.Errorf("error scraping conflicting user (ID %d): %w", user_id, err))
		}
		if err := app.Profile.SaveUser(&fresh_user); err != nil {
			panic(fmt.Errorf(
				"error saving rescraped conflicting user with ID %d and handle %q: %w",
				fresh_user.ID, fresh_user.Handle, err,
			))
		}
	}
	// Download media content in background; conflicts were already resolved
	// above, so the returned conflict list is intentionally discarded
	go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia)
}

View File

@ -5,37 +5,18 @@ import (
"fmt" "fmt"
"path" "path"
log "github.com/sirupsen/logrus"
. "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper" . "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
) )
// Convenience function that saves all the objects in a TweetTrove. // Convenience function that saves all the objects in a TweetTrove.
// Panics if anything goes wrong. // Returns a list of UserIDs that had conflicting handles with another user.
func (p Profile) SaveTweetTrove(trove TweetTrove, should_download bool, download func(string) ([]byte, error)) { func (p Profile) SaveTweetTrove(trove TweetTrove, should_download bool, download func(string) ([]byte, error)) (conflict_u_ids []UserID) {
for i, u := range trove.Users { for i, u := range trove.Users {
err := p.SaveUser(&u) err := p.SaveUser(&u)
// Check for handle conflicts and handle them in place // Check for user-handle conflicts
// TODO: this is hacky, it doesn't go here. We should return a list of conflicting users
// who were marked as deleted, and then let the callee re-scrape and re-save them.
var conflict_err ErrConflictingUserHandle var conflict_err ErrConflictingUserHandle
if errors.As(err, &conflict_err) { if errors.As(err, &conflict_err) {
log.Warnf( conflict_u_ids = append(conflict_u_ids, conflict_err.ConflictingUserID)
"Conflicting user handle found (ID %d); old user has been marked deleted. Rescraping them\n",
conflict_err.ConflictingUserID,
)
user, err := GetUserByID(conflict_err.ConflictingUserID)
if errors.Is(err, ErrDoesntExist) {
// Mark them as deleted.
// Handle and display name won't be updated if the user exists.
user = User{ID: conflict_err.ConflictingUserID, DisplayName: "<Unknown User>", Handle: "<UNKNOWN USER>", IsDeleted: true}
} else if err != nil {
panic(fmt.Errorf("error scraping conflicting user (ID %d): %w", conflict_err.ConflictingUserID, err))
}
err = p.SaveUser(&user)
if err != nil {
panic(fmt.Errorf("error saving rescraped conflicting user with ID %d and handle %q: %w", user.ID, user.Handle, err))
}
} else if err != nil { } else if err != nil {
panic(fmt.Errorf("Error saving user with ID %d and handle %s:\n %w", u.ID, u.Handle, err)) panic(fmt.Errorf("Error saving user with ID %d and handle %s:\n %w", u.ID, u.Handle, err))
} }
@ -253,4 +234,5 @@ func (p Profile) SaveTweetTrove(trove TweetTrove, should_download bool, download
} }
} }
} }
return conflict_u_ids // If there are any
} }