From 4132eb4bcaa1fd18c1d14bf725e3da331de12bba Mon Sep 17 00:00:00 2001 From: Alessio Date: Mon, 10 Feb 2025 12:53:19 -0800 Subject: [PATCH] REFACTOR: `SaveTweetTrove` no longer calls `scraper.GetUserByID`, removing another scraper dependency from `persistence` package - it was calling it to handle conflicting user handles - now user handle conflicts are handled by the caller - Add a utility function in `main` and `webserver` packages to rescrape the conflicting users --- cmd/twitter/helpers.go | 27 +++++++++++++++ cmd/twitter/main.go | 24 +++++++------- internal/webserver/handler_bookmarks.go | 3 +- internal/webserver/handler_login.go | 9 ++--- internal/webserver/handler_messages.go | 9 ++--- internal/webserver/handler_search.go | 3 +- internal/webserver/handler_tweet_detail.go | 3 +- internal/webserver/handler_user_feed.go | 12 +++---- internal/webserver/stopwatch.go | 3 +- internal/webserver/temp_utils.go | 38 ++++++++++++++++++++++ pkg/persistence/tweet_trove_queries.go | 28 +++------------- 11 files changed, 96 insertions(+), 63 deletions(-) create mode 100644 internal/webserver/temp_utils.go diff --git a/cmd/twitter/helpers.go b/cmd/twitter/helpers.go index a118279..00d7690 100644 --- a/cmd/twitter/helpers.go +++ b/cmd/twitter/helpers.go @@ -96,3 +96,30 @@ func is_scrape_failure(err error) bool { } return true } + +// DUPE: full_save_tweet_trove +func full_save_tweet_trove(trove scraper.TweetTrove) { + conflicting_users := profile.SaveTweetTrove(trove, true, api.DownloadMedia) + for _, u_id := range conflicting_users { + fmt.Printf(terminal_utils.COLOR_YELLOW+ + "Conflicting user handle found (ID %d); old user has been marked deleted. Rescraping manually"+ + terminal_utils.COLOR_RESET+"\n", + u_id) + // Rescrape + updated_user, err := scraper.GetUserByID(u_id) + if errors.Is(err, scraper.ErrDoesntExist) { + // Mark them as deleted. + // Handle and display name won't be updated if the user exists. 
+ updated_user = scraper.User{ID: u_id, DisplayName: "", Handle: "", IsDeleted: true} + } else if err != nil { + panic(fmt.Errorf("error scraping conflicting user (ID %d): %w", u_id, err)) + } + err = profile.SaveUser(&updated_user) + if err != nil { + panic(fmt.Errorf( + "error saving rescraped conflicting user with ID %d and handle %q: %w", + updated_user.ID, updated_user.Handle, err, + )) + } + } +} diff --git a/cmd/twitter/main.go b/cmd/twitter/main.go index da679ae..feff547 100644 --- a/cmd/twitter/main.go +++ b/cmd/twitter/main.go @@ -410,7 +410,7 @@ func fetch_tweet_conversation(tweet_identifier string, how_many int) { if is_scrape_failure(err) { die(err.Error(), false, -1) } - profile.SaveTweetTrove(trove, true, api.DownloadMedia) + full_save_tweet_trove(trove) happy_exit(fmt.Sprintf("Saved %d tweets and %d users", len(trove.Tweets), len(trove.Users)), err) } @@ -431,7 +431,7 @@ func fetch_user_feed(handle string, how_many int) { if is_scrape_failure(err) { die(fmt.Sprintf("Error scraping feed: %s\n %s", handle, err.Error()), false, -2) } - profile.SaveTweetTrove(trove, true, api.DownloadMedia) + full_save_tweet_trove(trove) happy_exit( fmt.Sprintf("Saved %d tweets, %d retweets and %d users", len(trove.Tweets), len(trove.Retweets), len(trove.Users)), @@ -449,7 +449,7 @@ func get_user_likes(handle string, how_many int) { if is_scrape_failure(err) { die(fmt.Sprintf("Error scraping feed: %s\n %s", handle, err.Error()), false, -2) } - profile.SaveTweetTrove(trove, true, api.DownloadMedia) + full_save_tweet_trove(trove) happy_exit( fmt.Sprintf("Saved %d tweets, %d retweets and %d users", len(trove.Tweets), len(trove.Retweets), len(trove.Users)), @@ -467,7 +467,7 @@ func get_followees(handle string, how_many int) { if is_scrape_failure(err) { die(fmt.Sprintf("Error getting followees: %s\n %s", handle, err.Error()), false, -2) } - profile.SaveTweetTrove(trove, true, api.DownloadMedia) + full_save_tweet_trove(trove) profile.SaveAsFolloweesList(user.ID, trove) 
happy_exit(fmt.Sprintf("Saved %d followees", len(trove.Users)), err) @@ -481,7 +481,7 @@ func get_followers(handle string, how_many int) { if is_scrape_failure(err) { die(fmt.Sprintf("Error getting followees: %s\n %s", handle, err.Error()), false, -2) } - profile.SaveTweetTrove(trove, true, api.DownloadMedia) + full_save_tweet_trove(trove) profile.SaveAsFollowersList(user.ID, trove) happy_exit(fmt.Sprintf("Saved %d followers", len(trove.Users)), err) @@ -491,7 +491,7 @@ func get_bookmarks(how_many int) { if is_scrape_failure(err) { die(fmt.Sprintf("Error scraping bookmarks:\n %s", err.Error()), false, -2) } - profile.SaveTweetTrove(trove, true, api.DownloadMedia) + full_save_tweet_trove(trove) happy_exit(fmt.Sprintf( "Saved %d tweets, %d retweets, %d users, and %d bookmarks", @@ -504,7 +504,7 @@ func fetch_timeline(is_following_only bool) { if is_scrape_failure(err) { die(fmt.Sprintf("Error fetching timeline:\n %s", err.Error()), false, -2) } - profile.SaveTweetTrove(trove, true, api.DownloadMedia) + full_save_tweet_trove(trove) happy_exit( fmt.Sprintf("Saved %d tweets, %d retweets and %d users", len(trove.Tweets), len(trove.Retweets), len(trove.Users)), @@ -544,7 +544,7 @@ func search(query string, how_many int) { if is_scrape_failure(err) { die(fmt.Sprintf("Error scraping search results: %s", err.Error()), false, -100) } - profile.SaveTweetTrove(trove, true, api.DownloadMedia) + full_save_tweet_trove(trove) happy_exit(fmt.Sprintf("Saved %d tweets and %d users", len(trove.Tweets), len(trove.Users)), err) } @@ -607,7 +607,7 @@ func fetch_inbox(how_many int) { if err != nil { die(fmt.Sprintf("Failed to fetch inbox:\n %s", err.Error()), false, 1) } - profile.SaveTweetTrove(trove, true, api.DownloadMedia) + full_save_tweet_trove(trove) happy_exit(fmt.Sprintf("Saved %d messages from %d chats", len(trove.Messages), len(trove.Rooms)), nil) } @@ -621,7 +621,7 @@ func fetch_dm(id string, how_many int) { if err != nil { die(fmt.Sprintf("Failed to fetch dm:\n %s", 
err.Error()), false, 1) } - profile.SaveTweetTrove(trove, true, api.DownloadMedia) + full_save_tweet_trove(trove) happy_exit( fmt.Sprintf("Saved %d messages from %d chats", len(trove.Messages), len(trove.Rooms)), err, @@ -638,7 +638,7 @@ func send_dm(room_id string, text string, in_reply_to_id int) { if err != nil { die(fmt.Sprintf("Failed to send dm:\n %s", err.Error()), false, 1) } - profile.SaveTweetTrove(trove, true, api.DownloadMedia) + full_save_tweet_trove(trove) happy_exit(fmt.Sprintf("Saved %d messages from %d chats", len(trove.Messages), len(trove.Rooms)), nil) } @@ -670,7 +670,7 @@ func get_notifications(how_many int) { panic(err) } - profile.SaveTweetTrove(trove, true, api.DownloadMedia) + full_save_tweet_trove(trove) happy_exit(fmt.Sprintf("Saved %d notifications, %d tweets and %d users", len(trove.Notifications), len(trove.Tweets), len(trove.Users), ), nil) diff --git a/internal/webserver/handler_bookmarks.go b/internal/webserver/handler_bookmarks.go index 1afdd29..a759ab8 100644 --- a/internal/webserver/handler_bookmarks.go +++ b/internal/webserver/handler_bookmarks.go @@ -26,8 +26,7 @@ func (app *Application) Bookmarks(w http.ResponseWriter, r *http.Request) { panic(err) // Return a toast } - app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) - go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) + app.full_save_tweet_trove(trove) } c := persistence.NewUserFeedBookmarksCursor(app.ActiveUser.Handle) diff --git a/internal/webserver/handler_login.go b/internal/webserver/handler_login.go index f860de8..3b73e23 100644 --- a/internal/webserver/handler_login.go +++ b/internal/webserver/handler_login.go @@ -89,8 +89,7 @@ func (app *Application) after_login(w http.ResponseWriter, r *http.Request, api http.Redirect(w, r, "/", 303) } fmt.Println("Saving initial feed results...") - app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) - go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) + 
app.full_save_tweet_trove(trove) // Scrape the user's followers trove, err = app.API.GetFollowees(user.ID, 1000) @@ -98,9 +97,8 @@ func (app *Application) after_login(w http.ResponseWriter, r *http.Request, api app.ErrorLog.Printf("Failed to scrape followers: %s", err.Error()) http.Redirect(w, r, "/", 303) } - app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) + app.full_save_tweet_trove(trove) app.Profile.SaveAsFolloweesList(user.ID, trove) - go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) // Redirect to Timeline http.Redirect(w, r, "/", 303) @@ -129,8 +127,7 @@ func (app *Application) ChangeSession(w http.ResponseWriter, r *http.Request) { return } // We have to save the notifications first, otherwise it'll just report 0 since the last-read sort index - app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) - go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) + app.full_save_tweet_trove(trove) // Set the notifications count app.LastReadNotificationSortIndex = last_unread_notification_sort_index }() diff --git a/internal/webserver/handler_messages.go b/internal/webserver/handler_messages.go index c027e27..5e29b85 100644 --- a/internal/webserver/handler_messages.go +++ b/internal/webserver/handler_messages.go @@ -80,8 +80,7 @@ func (app *Application) message_send(w http.ResponseWriter, r *http.Request) { if err != nil { panic(err) } - app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) - go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) + app.full_save_tweet_trove(trove) } func (app *Application) message_detail(w http.ResponseWriter, r *http.Request) { @@ -153,8 +152,7 @@ func (app *Application) message_detail(w http.ResponseWriter, r *http.Request) { if err != nil { panic(err) } - app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) - go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) // Download the content in the background + 
app.full_save_tweet_trove(trove) } // `LatestPollingTimestamp` sort of passes-through the function; if we're not updating it, it @@ -244,8 +242,7 @@ func (app *Application) Messages(w http.ResponseWriter, r *http.Request) { panic(err) } inbox_cursor = new_cursor - app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) - go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) + app.full_save_tweet_trove(trove) } parts := strings.Split(strings.Trim(r.URL.Path, "/"), "/") diff --git a/internal/webserver/handler_search.go b/internal/webserver/handler_search.go index ab53443..372e8a3 100644 --- a/internal/webserver/handler_search.go +++ b/internal/webserver/handler_search.go @@ -107,8 +107,7 @@ func (app *Application) Search(w http.ResponseWriter, r *http.Request) { app.ErrorLog.Print(err) // TOOD: show error in UI } - app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) - go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) + app.full_save_tweet_trove(trove) } c, err := persistence.NewCursorFromSearchQuery(search_text) diff --git a/internal/webserver/handler_tweet_detail.go b/internal/webserver/handler_tweet_detail.go index 7cb8a4b..816be88 100644 --- a/internal/webserver/handler_tweet_detail.go +++ b/internal/webserver/handler_tweet_detail.go @@ -53,8 +53,7 @@ func (app *Application) ensure_tweet(id scraper.TweetID, is_forced bool, is_conv // Save the trove unless there was an unrecoverable error if err == nil || errors.Is(err, scraper.END_OF_FEED) || errors.Is(err, scraper.ErrRateLimited) { - app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) - go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) // Download the content in the background + app.full_save_tweet_trove(trove) _, is_available = trove.Tweets[id] } diff --git a/internal/webserver/handler_user_feed.go b/internal/webserver/handler_user_feed.go index bb200d1..73cbdf8 100644 --- a/internal/webserver/handler_user_feed.go +++ 
b/internal/webserver/handler_user_feed.go @@ -59,16 +59,14 @@ func (app *Application) UserFeed(w http.ResponseWriter, r *http.Request) { app.ErrorLog.Print(err) // TOOD: show error in UI } - app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) - go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) + app.full_save_tweet_trove(trove) } else if len(parts) == 2 && parts[1] == "likes" { trove, err := app.API.GetUserLikes(user.ID, 50) // TODO: parameterizable if err != nil { app.ErrorLog.Print(err) // TOOD: show error in UI } - app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) - go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) + app.full_save_tweet_trove(trove) } } @@ -171,9 +169,8 @@ func (app *Application) UserFollowees(w http.ResponseWriter, r *http.Request, us app.ErrorLog.Print(err) // TOOD: show error in UI } - app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) + app.full_save_tweet_trove(trove) app.Profile.SaveAsFolloweesList(user.ID, trove) - go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) } data, trove := NewFollowsData(app.Profile.GetFollowees(user.ID)) @@ -197,9 +194,8 @@ func (app *Application) UserFollowers(w http.ResponseWriter, r *http.Request, us app.ErrorLog.Print(err) // TOOD: show error in UI } - app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) + app.full_save_tweet_trove(trove) app.Profile.SaveAsFollowersList(user.ID, trove) - go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) } data, trove := NewFollowsData(app.Profile.GetFollowers(user.ID)) diff --git a/internal/webserver/stopwatch.go b/internal/webserver/stopwatch.go index 49b2c24..bcd96ca 100644 --- a/internal/webserver/stopwatch.go +++ b/internal/webserver/stopwatch.go @@ -49,8 +49,7 @@ func (t *BackgroundTask) Do() { // Run the task trove := t.GetTroveFunc(&t.app.API) t.log.Print("saving results") - t.app.Profile.SaveTweetTrove(trove, false, t.app.API.DownloadMedia) - go 
t.app.Profile.SaveTweetTrove(trove, true, t.app.API.DownloadMedia) + t.app.full_save_tweet_trove(trove) t.log.Print("success") } diff --git a/internal/webserver/temp_utils.go b/internal/webserver/temp_utils.go new file mode 100644 index 0000000..8666b2b --- /dev/null +++ b/internal/webserver/temp_utils.go @@ -0,0 +1,38 @@ +package webserver + +import ( + "errors" + "fmt" + + . "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper" +) + +// DUPE: full_save_tweet_trove +func (app *Application) full_save_tweet_trove(trove TweetTrove) { + // Save the initial trove + conflicting_users := app.Profile.SaveTweetTrove(trove, false, app.API.DownloadMedia) + + // Handle conflicting users + for _, u_id := range conflicting_users { + app.InfoLog.Printf("Conflicting user handle found (ID %d); old user has been marked deleted. Rescraping manually", u_id) + // Rescrape + updated_user, err := GetUserByID(u_id) + if errors.Is(err, ErrDoesntExist) { + // Mark them as deleted. + // Handle and display name won't be updated if the user exists. + updated_user = User{ID: u_id, DisplayName: "", Handle: "", IsDeleted: true} + } else if err != nil { + panic(fmt.Errorf("error scraping conflicting user (ID %d): %w", u_id, err)) + } + err = app.Profile.SaveUser(&updated_user) + if err != nil { + panic(fmt.Errorf( + "error saving rescraped conflicting user with ID %d and handle %q: %w", + updated_user.ID, updated_user.Handle, err, + )) + } + } + + // Download media content in background + go app.Profile.SaveTweetTrove(trove, true, app.API.DownloadMedia) +} diff --git a/pkg/persistence/tweet_trove_queries.go b/pkg/persistence/tweet_trove_queries.go index 0dba9f2..3eb5493 100644 --- a/pkg/persistence/tweet_trove_queries.go +++ b/pkg/persistence/tweet_trove_queries.go @@ -5,37 +5,18 @@ import ( "fmt" "path" - log "github.com/sirupsen/logrus" - . "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper" ) // Convenience function that saves all the objects in a TweetTrove. 
-// Panics if anything goes wrong. -func (p Profile) SaveTweetTrove(trove TweetTrove, should_download bool, download func(string) ([]byte, error)) { +// Returns a list of UserIDs that had conflicting handles with another user. +func (p Profile) SaveTweetTrove(trove TweetTrove, should_download bool, download func(string) ([]byte, error)) (conflict_u_ids []UserID) { for i, u := range trove.Users { err := p.SaveUser(&u) - // Check for handle conflicts and handle them in place - // TODO: this is hacky, it doesn't go here. We should return a list of conflicting users - // who were marked as deleted, and then let the callee re-scrape and re-save them. + // Check for user-handle conflicts var conflict_err ErrConflictingUserHandle if errors.As(err, &conflict_err) { - log.Warnf( - "Conflicting user handle found (ID %d); old user has been marked deleted. Rescraping them\n", - conflict_err.ConflictingUserID, - ) - user, err := GetUserByID(conflict_err.ConflictingUserID) - if errors.Is(err, ErrDoesntExist) { - // Mark them as deleted. - // Handle and display name won't be updated if the user exists. - user = User{ID: conflict_err.ConflictingUserID, DisplayName: "", Handle: "", IsDeleted: true} - } else if err != nil { - panic(fmt.Errorf("error scraping conflicting user (ID %d): %w", conflict_err.ConflictingUserID, err)) - } - err = p.SaveUser(&user) - if err != nil { - panic(fmt.Errorf("error saving rescraped conflicting user with ID %d and handle %q: %w", user.ID, user.Handle, err)) - } + conflict_u_ids = append(conflict_u_ids, conflict_err.ConflictingUserID) } else if err != nil { panic(fmt.Errorf("Error saving user with ID %d and handle %s:\n %w", u.ID, u.Handle, err)) } @@ -253,4 +234,5 @@ func (p Profile) SaveTweetTrove(trove TweetTrove, should_download bool, download } } } + return conflict_u_ids // If there are any }