package persistence

import (
	"errors"
	"fmt"
	"path"

	log "github.com/sirupsen/logrus"

	. "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
)

// SaveTweetTrove is a convenience function that saves all the objects in a TweetTrove.
//
// Objects are saved in this order: users, spaces, tweets, retweets, likes, bookmarks,
// notifications, chat rooms, chat messages.
//
// If should_download is true, media is fetched as well (tiny profile images, tweet content,
// and DM images / videos / link thumbnails) using api.DownloadMedia.  The api argument is
// only dereferenced when should_download is true.
//
// Panics if anything goes wrong, with these exceptions:
//   - a conflicting user handle is resolved in place by re-fetching and re-saving the
//     conflicting user
//   - request timeouts during downloads (and 404s for tweet content) are printed and skipped;
//     the affected item is not marked as downloaded, so a later run can retry it
func (p Profile) SaveTweetTrove(trove TweetTrove, should_download bool, api *API) {
	for i, u := range trove.Users {
		err := p.SaveUser(&u)

		// Check for handle conflicts and handle them in place
		// TODO: this is hacky, it doesn't go here. We should return a list of conflicting users
		// who were marked as deleted, and then let the callee re-scrape and re-save them.
		var conflict_err ErrConflictingUserHandle
		if errors.As(err, &conflict_err) {
			log.Warnf(
				"Conflicting user handle found (ID %d); old user has been marked deleted. Rescraping them\n",
				conflict_err.ConflictingUserID,
			)
			user, err := GetUserByID(conflict_err.ConflictingUserID)
			if errors.Is(err, ErrDoesntExist) {
				// Mark them as deleted.
				// Handle and display name won't be updated if the user exists.
				user = User{ID: conflict_err.ConflictingUserID, DisplayName: "", Handle: "", IsDeleted: true}
			} else if err != nil {
				panic(fmt.Errorf("error scraping conflicting user (ID %d): %w", conflict_err.ConflictingUserID, err))
			}
			err = p.SaveUser(&user)
			if err != nil {
				panic(fmt.Errorf("error saving rescraped conflicting user with ID %d and handle %q: %w", user.ID, user.Handle, err))
			}
		} else if err != nil {
			panic(fmt.Errorf("Error saving user with ID %d and handle %s:\n %w", u.ID, u.Handle, err))
		}
		// NOTE(review): prints every saved user to stdout; looks like leftover debug output -- confirm
		fmt.Println(u.Handle, u.ID)

		// If the User's ID was updated in saving (i.e., Unknown User), update it in the Trove too
		// Also update tweets, retweets and spaces that reference this UserID.
		// trove.Users[i] still holds the pre-save copy here, so its ID is the "old" ID to match on.
		for j, tweet := range trove.Tweets {
			if tweet.UserID == trove.Users[i].ID {
				tweet.UserID = u.ID
				trove.Tweets[j] = tweet
			}
		}
		for j, retweet := range trove.Retweets {
			if retweet.RetweetedByID == trove.Users[i].ID {
				retweet.RetweetedByID = u.ID
				trove.Retweets[j] = retweet
			}
		}
		for j, space := range trove.Spaces {
			if space.CreatedById == trove.Users[i].ID {
				space.CreatedById = u.ID
				trove.Spaces[j] = space
			}
		}
		// Only now overwrite the trove entry with the saved copy
		trove.Users[i] = u

		if should_download {
			// Download their tiny profile image
			err = p.DownloadUserProfileImageTiny(&u, api.DownloadMedia)
			if errors.Is(err, ErrRequestTimeout) {
				// Forget about it; if it's important someone will try again
				fmt.Printf("Failed to @%s's tiny profile image (%q): %s\n", u.Handle, u.ProfileImageUrl, err.Error())
			} else if err != nil {
				panic(fmt.Errorf("Error downloading user content for user with ID %d and handle %s:\n %w", u.ID, u.Handle, err))
			}
		}
	}

	for _, s := range trove.Spaces {
		err := p.SaveSpace(s)
		if err != nil {
			panic(fmt.Errorf("Error saving space with ID %s:\n %w", s.ID, err))
		}
	}

	for _, t := range trove.Tweets {
		err := p.SaveTweet(t)
		if err != nil {
			panic(fmt.Errorf("Error saving tweet ID %d:\n %w", t.ID, err))
		}

		if should_download {
			err = p.DownloadTweetContentFor(&t, api.DownloadMedia)
			if errors.Is(err, ErrRequestTimeout) || errors.Is(err, ErrMediaDownload404) {
				// Forget about it; if it's important someone will try again
				fmt.Printf("Failed to download tweet ID %d: %s\n", t.ID, err.Error())
			} else if err != nil {
				panic(fmt.Errorf("Error downloading tweet content for tweet ID %d:\n %w", t.ID, err))
			}
		}
	}

	for _, r := range trove.Retweets {
		err := p.SaveRetweet(r)
		if err != nil {
			panic(fmt.Errorf("Error saving retweet with ID %d from user ID %d:\n %w", r.RetweetID, r.RetweetedByID, err))
		}
	}

	for _, l := range trove.Likes {
		err := p.SaveLike(l)
		if err != nil {
			panic(fmt.Errorf("Error saving Like: %#v\n %w", l, err))
		}
	}

	for _, b := range trove.Bookmarks {
		err := p.SaveBookmark(b)
		if err != nil {
			panic(fmt.Errorf("Error saving Bookmark: %#v\n %w", b, err))
		}
	}

	for _, n := range trove.Notifications {
		p.SaveNotification(n)
	}

	// DM related content
	// ------------------

	for _, r := range trove.Rooms {
		err := p.SaveChatRoom(r)
		if err != nil {
			panic(fmt.Errorf("Error saving chat room: %#v\n %w", r, err))
		}
	}

	for _, m := range trove.Messages {
		if err := p.SaveChatMessage(m); err != nil {
			panic(fmt.Errorf("Error saving chat message: %#v\n %w", m, err))
		}

		// TODO: all of this is very duplicated and should be refactored
		// Copied from media_download.go functions:
		// - download_tweet_image, download_tweet_video, download_link_thumbnail
		// - DownloadTweetContentWithInjector
		// Copied from tweet_queries.go functions:
		// - CheckTweetContentDownloadNeeded

		// Download content if needed
		if !should_download {
			continue
		}
		downloader := DefaultDownloader{Download: api.DownloadMedia}

		for _, img := range m.Images {
			// Check if it's already downloaded
			var is_downloaded bool
			err := p.DB.Get(&is_downloaded, `select is_downloaded from chat_message_images where id = ?`, img.ID)
			if err != nil {
				panic(err)
			}
			if is_downloaded {
				// Already downloaded; skip
				continue
			}

			// DUPE: download-image
			outfile := path.Join(p.ProfileDir, "images", img.LocalFilename)
			err = downloader.Curl(img.RemoteURL, outfile)
			if errors.Is(err, ErrRequestTimeout) {
				// Forget about it; if it's important someone will try again
				fmt.Printf("Failed to download image %q: %s\n", img.RemoteURL, err.Error())
				// Nothing was fetched, so don't mark it as downloaded; otherwise the
				// "already downloaded" check above would prevent any retry.
				continue
			} else if err != nil {
				panic(fmt.Errorf("downloading image %q on DM message %d:\n %w", img.RemoteURL, m.ID, err))
			}

			_, err = p.DB.NamedExec(`update chat_message_images set is_downloaded = 1 where id = :id`, img)
			if err != nil {
				panic(err)
			}
		}

		for _, vid := range m.Videos {
			// Videos can be geoblocked, and the HTTP response isn't in JSON so it's hard to capture
			if vid.IsGeoblocked {
				continue
			}

			// Check if it's already downloaded
			var is_downloaded bool
			err := p.DB.Get(&is_downloaded, `select is_downloaded from chat_message_videos where id = ?`, vid.ID)
			if err != nil {
				panic(err)
			}
			if is_downloaded {
				// Already downloaded; skip
				continue
			}

			// DUPE: download-video
			// Download the video
			outfile := path.Join(p.ProfileDir, "videos", vid.LocalFilename)
			err = downloader.Curl(vid.RemoteURL, outfile)
			switch {
			case errors.Is(err, ErrRequestTimeout):
				// Forget about it; if it's important someone will try again.
				// vid.IsDownloaded is left untouched, so the DB update below doesn't claim success.
				fmt.Printf("Failed to download video %q: %s\n", vid.RemoteURL, err.Error())
			case errors.Is(err, ErrorDMCA):
				vid.IsDownloaded = false
				vid.IsBlockedByDMCA = true
			case err != nil:
				panic(fmt.Errorf("downloading video %q on DM message %d:\n %w", vid.RemoteURL, m.ID, err))
			default:
				vid.IsDownloaded = true
			}

			// Download the thumbnail (attempted regardless of how the video download went)
			outfile = path.Join(p.ProfileDir, "video_thumbnails", vid.ThumbnailLocalPath)
			err = downloader.Curl(vid.ThumbnailRemoteUrl, outfile)
			if errors.Is(err, ErrRequestTimeout) {
				// Forget about it; if it's important someone will try again
				fmt.Printf("Failed to download video thumbnail %q: %s\n", vid.ThumbnailRemoteUrl, err.Error())
			} else if err != nil {
				panic(fmt.Errorf("Error downloading video thumbnail (DMMessageID %d):\n %w", vid.DMMessageID, err))
			}

			// Update it in the DB
			_, err = p.DB.NamedExec(` update chat_message_videos set is_downloaded = :is_downloaded, is_blocked_by_dmca = :is_blocked_by_dmca where id = :id `, vid)
			if err != nil {
				panic(err)
			}
		}

		for _, url := range m.Urls {
			// DUPE: download-link-thumbnail
			if url.HasCard && url.HasThumbnail {
				outfile := path.Join(p.ProfileDir, "link_preview_images", url.ThumbnailLocalPath)
				err := downloader.Curl(url.ThumbnailRemoteUrl, outfile)
				if errors.Is(err, ErrRequestTimeout) {
					// Forget about it; if it's important someone will try again
					fmt.Printf("Failed to download link thumbnail %q: %s\n", url.ThumbnailRemoteUrl, err.Error())
					// The thumbnail is missing, so don't record the content as downloaded
					continue
				} else if err != nil {
					panic(fmt.Errorf("downloading link thumbnail %q on DM message %d:\n %w", url.ThumbnailRemoteUrl, m.ID, err))
				}
			}
			// URLs without a card or thumbnail have nothing to fetch and count as downloaded
			url.IsContentDownloaded = true

			// Update it in the DB
			// NOTE(review): the update is keyed on chat_message_id rather than a per-URL id;
			// confirm that's intended for messages carrying more than one URL.
			if _, err := p.DB.NamedExec(` update chat_message_urls set is_content_downloaded = :is_content_downloaded where chat_message_id = :chat_message_id `, url); err != nil {
				panic(err)
			}
		}
	}
}