offline-twitter/pkg/persistence/tweet_trove_queries.go

257 lines
8.4 KiB
Go

package persistence
import (
"errors"
"fmt"
"path"
log "github.com/sirupsen/logrus"
. "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
)
// SaveTweetTrove is a convenience function that saves all the objects in a TweetTrove: users,
// spaces, tweets, retweets, likes, bookmarks, notifications, DM chat rooms and DM messages, in
// that order.
//
// If `should_download` is true, it also downloads each user's tiny profile image, each tweet's
// content, and the images, videos, video thumbnails and link thumbnails attached to each DM
// message, using `api.DownloadMedia`.  `api` is only dereferenced when `should_download` is true.
//
// Download timeouts (and, for tweet content, 404s) are printed to stdout and skipped; the affected
// DM image or link thumbnail is left un-marked so that a later call can retry it.
//
// Panics if anything else goes wrong.
func (p Profile) SaveTweetTrove(trove TweetTrove, should_download bool, api *API) {
	for i, u := range trove.Users {
		err := p.SaveUser(&u)

		// Check for handle conflicts and handle them in place
		// TODO: this is hacky, it doesn't go here.  We should return a list of conflicting users
		// who were marked as deleted, and then let the callee re-scrape and re-save them.
		var conflict_err ErrConflictingUserHandle
		if errors.As(err, &conflict_err) {
			log.Warnf(
				"Conflicting user handle found (ID %d); old user has been marked deleted. Rescraping them\n",
				conflict_err.ConflictingUserID,
			)
			user, err := GetUserByID(conflict_err.ConflictingUserID)
			if errors.Is(err, ErrDoesntExist) {
				// Mark them as deleted.
				// Handle and display name won't be updated if the user exists.
				user = User{ID: conflict_err.ConflictingUserID, DisplayName: "<Unknown User>", Handle: "<UNKNOWN USER>", IsDeleted: true}
			} else if err != nil {
				panic(fmt.Errorf("error scraping conflicting user (ID %d): %w", conflict_err.ConflictingUserID, err))
			}
			err = p.SaveUser(&user)
			if err != nil {
				panic(fmt.Errorf("error saving rescraped conflicting user with ID %d and handle %q: %w", user.ID, user.Handle, err))
			}
		} else if err != nil {
			panic(fmt.Errorf("Error saving user with ID %d and handle %s:\n %w", u.ID, u.Handle, err))
		}

		// NOTE(review): this looks like leftover debug output -- confirm whether callers rely on it
		fmt.Println(u.Handle, u.ID)

		// If the User's ID was updated in saving (i.e., Unknown User), update it in the Trove too.
		// Also update tweets, retweets and spaces that reference this UserID.
		// `u` is a copy that `SaveUser` may have modified; the trove still holds the old ID.
		// When the ID didn't change there is nothing to rewrite, so skip the scans entirely.
		old_id := trove.Users[i].ID
		if old_id != u.ID {
			for j, tweet := range trove.Tweets {
				if tweet.UserID == old_id {
					tweet.UserID = u.ID
					trove.Tweets[j] = tweet
				}
			}
			for j, retweet := range trove.Retweets {
				if retweet.RetweetedByID == old_id {
					retweet.RetweetedByID = u.ID
					trove.Retweets[j] = retweet
				}
			}
			for j, space := range trove.Spaces {
				if space.CreatedById == old_id {
					space.CreatedById = u.ID
					trove.Spaces[j] = space
				}
			}
		}
		trove.Users[i] = u

		if should_download {
			// Download their tiny profile image
			err = p.DownloadUserProfileImageTiny(&u, api.DownloadMedia)
			if errors.Is(err, ErrRequestTimeout) {
				// Forget about it; if it's important someone will try again
				fmt.Printf("Failed to download @%s's tiny profile image (%q): %s\n", u.Handle, u.ProfileImageUrl, err.Error())
			} else if err != nil {
				panic(fmt.Errorf("Error downloading user content for user with ID %d and handle %s:\n %w", u.ID, u.Handle, err))
			}
		}
	}

	for _, s := range trove.Spaces {
		err := p.SaveSpace(s)
		if err != nil {
			panic(fmt.Errorf("Error saving space with ID %s:\n %w", s.ID, err))
		}
	}

	for _, t := range trove.Tweets {
		err := p.SaveTweet(t)
		if err != nil {
			panic(fmt.Errorf("Error saving tweet ID %d:\n %w", t.ID, err))
		}

		if should_download {
			err = p.DownloadTweetContentFor(&t, api.DownloadMedia)
			if errors.Is(err, ErrRequestTimeout) || errors.Is(err, ErrMediaDownload404) {
				// Forget about it; if it's important someone will try again
				fmt.Printf("Failed to download tweet ID %d: %s\n", t.ID, err.Error())
			} else if err != nil {
				panic(fmt.Errorf("Error downloading tweet content for tweet ID %d:\n %w", t.ID, err))
			}
		}
	}

	for _, r := range trove.Retweets {
		err := p.SaveRetweet(r)
		if err != nil {
			panic(fmt.Errorf("Error saving retweet with ID %d from user ID %d:\n %w", r.RetweetID, r.RetweetedByID, err))
		}
	}

	for _, l := range trove.Likes {
		err := p.SaveLike(l)
		if err != nil {
			panic(fmt.Errorf("Error saving Like: %#v\n %w", l, err))
		}
	}

	for _, b := range trove.Bookmarks {
		err := p.SaveBookmark(b)
		if err != nil {
			panic(fmt.Errorf("Error saving Bookmark: %#v\n %w", b, err))
		}
	}

	for _, n := range trove.Notifications {
		p.SaveNotification(n)
	}

	// DM related content
	// ------------------

	for _, r := range trove.Rooms {
		err := p.SaveChatRoom(r)
		if err != nil {
			panic(fmt.Errorf("Error saving chat room: %#v\n %w", r, err))
		}
	}

	for _, m := range trove.Messages {
		err := p.SaveChatMessage(m)
		if err != nil {
			panic(fmt.Errorf("Error saving chat message: %#v\n %w", m, err))
		}

		// TODO: all of this is very duplicated and should be refactored
		// Copied from media_download.go functions:
		//   - download_tweet_image, download_tweet_video, download_link_thumbnail
		//   - DownloadTweetContentWithInjector
		// Copied from tweet_queries.go functions:
		//   - CheckTweetContentDownloadNeeded

		// Download content if needed
		if !should_download {
			continue
		}
		downloader := DefaultDownloader{Download: api.DownloadMedia}

		for _, img := range m.Images {
			// Check if it's already downloaded
			var is_downloaded bool
			err := p.DB.Get(&is_downloaded, `select is_downloaded from chat_message_images where id = ?`, img.ID)
			if err != nil {
				panic(err)
			}
			if is_downloaded {
				// Already downloaded; skip
				continue
			}

			// DUPE: download-image
			outfile := path.Join(p.ProfileDir, "images", img.LocalFilename)
			err = downloader.Curl(img.RemoteURL, outfile)
			if errors.Is(err, ErrRequestTimeout) {
				// Forget about it; if it's important someone will try again.
				// Don't mark it as downloaded, otherwise the check above would prevent any retry.
				fmt.Printf("Failed to download image %q: %s\n", img.RemoteURL, err.Error())
				continue
			} else if err != nil {
				panic(fmt.Errorf("downloading image %q on DM message %d:\n %w", img.RemoteURL, m.ID, err))
			}
			_, err = p.DB.NamedExec(`update chat_message_images set is_downloaded = 1 where id = :id`, img)
			if err != nil {
				panic(err)
			}
		}

		for _, vid := range m.Videos {
			// Videos can be geoblocked, and the HTTP response isn't in JSON so it's hard to capture
			if vid.IsGeoblocked {
				continue
			}

			// Check if it's already downloaded
			var is_downloaded bool
			err := p.DB.Get(&is_downloaded, `select is_downloaded from chat_message_videos where id = ?`, vid.ID)
			if err != nil {
				panic(err)
			}
			if is_downloaded {
				// Already downloaded; skip
				continue
			}

			// DUPE: download-video
			// Download the video.  `vid` is a per-iteration copy; the flags set here only matter
			// for the DB update below.
			outfile := path.Join(p.ProfileDir, "videos", vid.LocalFilename)
			err = downloader.Curl(vid.RemoteURL, outfile)
			if errors.Is(err, ErrRequestTimeout) {
				// Forget about it; if it's important someone will try again
				fmt.Printf("Failed to download video %q: %s\n", vid.RemoteURL, err.Error())
			} else if errors.Is(err, ErrorDMCA) {
				vid.IsDownloaded = false
				vid.IsBlockedByDMCA = true
			} else if err != nil {
				panic(fmt.Errorf("downloading video %q on DM message %d:\n %w", vid.RemoteURL, m.ID, err))
			} else {
				vid.IsDownloaded = true
			}

			// Download the thumbnail (attempted regardless of how the video download went)
			outfile = path.Join(p.ProfileDir, "video_thumbnails", vid.ThumbnailLocalPath)
			err = downloader.Curl(vid.ThumbnailRemoteUrl, outfile)
			if errors.Is(err, ErrRequestTimeout) {
				// Forget about it; if it's important someone will try again
				fmt.Printf("Failed to download video thumbnail %q: %s\n", vid.ThumbnailRemoteUrl, err.Error())
			} else if err != nil {
				panic(fmt.Errorf("Error downloading video thumbnail (DMMessageID %d):\n %w", vid.DMMessageID, err))
			}

			// Update it in the DB
			_, err = p.DB.NamedExec(`
				update chat_message_videos set is_downloaded = :is_downloaded, is_blocked_by_dmca = :is_blocked_by_dmca where id = :id
			`, vid)
			if err != nil {
				panic(err)
			}
		}

		for _, url := range m.Urls {
			// DUPE: download-link-thumbnail
			if url.HasCard && url.HasThumbnail {
				outfile := path.Join(p.ProfileDir, "link_preview_images", url.ThumbnailLocalPath)
				err := downloader.Curl(url.ThumbnailRemoteUrl, outfile)
				if errors.Is(err, ErrRequestTimeout) {
					// Forget about it; if it's important someone will try again.
					// The thumbnail wasn't fetched, so don't record the content as downloaded.
					fmt.Printf("Failed to download link thumbnail %q: %s\n", url.ThumbnailRemoteUrl, err.Error())
					continue
				} else if err != nil {
					panic(fmt.Errorf("downloading link thumbnail %q on DM message %d:\n %w", url.ThumbnailRemoteUrl, m.ID, err))
				}
			}
			url.IsContentDownloaded = true

			// Update it in the DB
			// NOTE(review): the `where` clause is keyed on the message, not the individual URL, so
			// this appears to mark every URL of the message at once -- confirm against the schema
			_, err := p.DB.NamedExec(`
				update chat_message_urls set is_content_downloaded = :is_content_downloaded where chat_message_id = :chat_message_id
			`, url)
			if err != nil {
				panic(err)
			}
		}
	}
}