diff --git a/pkg/scraper/api_types.go b/pkg/scraper/api_types.go index 1806e03..cae2874 100644 --- a/pkg/scraper/api_types.go +++ b/pkg/scraper/api_types.go @@ -4,6 +4,10 @@ import ( "encoding/json" "fmt" "html" + "log" + "net/url" + "path" + "regexp" "sort" "strconv" "strings" @@ -21,6 +25,19 @@ type APIMedia struct { } `json:"original_info"` } +func ParseAPIMedia(apiMedia APIMedia) Image { + local_filename := get_prefixed_path(path.Base(apiMedia.MediaURLHttps)) + + return Image{ + ID: ImageID(apiMedia.ID), + RemoteURL: apiMedia.MediaURLHttps, + Width: apiMedia.OriginalInfo.Width, + Height: apiMedia.OriginalInfo.Height, + LocalFilename: local_filename, + IsDownloaded: false, + } +} + type SortableVariants []struct { Bitrate int `json:"bitrate,omitempty"` URL string `json:"url"` @@ -137,6 +154,164 @@ type APICard struct { } `json:"binding_values"` } +func ParseAPIPoll(apiCard APICard) Poll { + card_url, err := url.Parse(apiCard.ShortenedUrl) + if err != nil { + panic(err) + } + id := int_or_panic(card_url.Hostname()) + + ret := Poll{} + ret.ID = PollID(id) + ret.NumChoices = parse_num_choices(apiCard.Name) + ret.VotingDuration = int_or_panic(apiCard.BindingValues.DurationMinutes.StringValue) * 60 + ret.VotingEndsAt, err = TimestampFromString(apiCard.BindingValues.EndDatetimeUTC.StringValue) + if err != nil { + panic(err) + } + ret.LastUpdatedAt, err = TimestampFromString(apiCard.BindingValues.LastUpdatedAt.StringValue) + if err != nil { + panic(err) + } + + ret.Choice1 = apiCard.BindingValues.Choice1.StringValue + ret.Choice1_Votes = int_or_panic(apiCard.BindingValues.Choice1_Count.StringValue) + ret.Choice2 = apiCard.BindingValues.Choice2.StringValue + ret.Choice2_Votes = int_or_panic(apiCard.BindingValues.Choice2_Count.StringValue) + + if ret.NumChoices > 2 { + ret.Choice3 = apiCard.BindingValues.Choice3.StringValue + ret.Choice3_Votes = int_or_panic(apiCard.BindingValues.Choice3_Count.StringValue) + } + if ret.NumChoices > 3 { + ret.Choice4 = apiCard.BindingValues.Choice4.StringValue + ret.Choice4_Votes = int_or_panic(apiCard.BindingValues.Choice4_Count.StringValue) + } + + return ret +} + +func parse_num_choices(card_name string) int { + if strings.Index(card_name, "poll") != 0 || strings.Index(card_name, "choice") != 5 { + panic("Not valid card name: " + card_name) + } + + return int_or_panic(card_name[4:5]) +} + +func ParseAPIVideo(apiVideo APIExtendedMedia) Video { + variants := apiVideo.VideoInfo.Variants + sort.Sort(variants) + video_remote_url := variants[0].URL + + var view_count int + + r := apiVideo.Ext.MediaStats.R + + switch r.(type) { + case string: + view_count = 0 + case map[string]interface{}: + OK_entry, ok := r.(map[string]interface{})["ok"] + if !ok { + panic("No 'ok' value found in the R!") + } + view_count_str, ok := OK_entry.(map[string]interface{})["viewCount"] + view_count = int_or_panic(view_count_str.(string)) + if !ok { + panic("No 'viewCount' value found in the OK!") + } + } + + video_parsed_url, err := url.Parse(video_remote_url) + if err != nil { + panic(err) + } + + local_filename := get_prefixed_path(path.Base(video_parsed_url.Path)) + + return Video{ + ID: VideoID(apiVideo.ID), + Width: apiVideo.OriginalInfo.Width, + Height: apiVideo.OriginalInfo.Height, + RemoteURL: video_remote_url, + LocalFilename: local_filename, + + ThumbnailRemoteUrl: apiVideo.MediaURLHttps, + ThumbnailLocalPath: get_prefixed_path(path.Base(apiVideo.MediaURLHttps)), + Duration: apiVideo.VideoInfo.Duration, + ViewCount: view_count, + + IsDownloaded: false, + IsBlockedByDMCA: false, + IsGeoblocked: apiVideo.ExtMediaAvailability.Reason == "Geoblocked", + IsGif: apiVideo.Type == "animated_gif", + } +} + +func ParseAPIUrlCard(apiCard APICard) Url { + values := apiCard.BindingValues + ret := Url{} + ret.HasCard = true + + ret.Domain = values.Domain.Value + ret.Title = values.Title.Value + ret.Description = values.Description.Value + ret.IsContentDownloaded = false + ret.CreatorID = UserID(values.Creator.UserValue.Value) + ret.SiteID = UserID(values.Site.UserValue.Value) + + var thumbnail_url string + + if apiCard.Name == "summary_large_image" || apiCard.Name == "summary" { + thumbnail_url = values.Thumbnail.ImageValue.Url + } else if apiCard.Name == "player" { + thumbnail_url = values.PlayerImage.ImageValue.Url + } else if apiCard.Name == "unified_card" { + // TODO: Grok chat previews + log.Print("Grok chat card, not implemented yet-- skipping") + } else { + panic("Unknown card type: " + apiCard.Name) + } + + if thumbnail_url != "" { + ret.HasThumbnail = true + ret.ThumbnailRemoteUrl = thumbnail_url + ret.ThumbnailLocalPath = get_thumbnail_local_path(thumbnail_url) + ret.ThumbnailWidth = values.Thumbnail.ImageValue.Width + ret.ThumbnailHeight = values.Thumbnail.ImageValue.Height + } + + return ret +} + +func get_prefixed_path(p string) string { + local_prefix_regex := regexp.MustCompile(`^[\w-]{2}`) + local_prefix := local_prefix_regex.FindString(p) + if len(local_prefix) != 2 { + panic(fmt.Sprintf("Unable to extract a 2-letter prefix for filename %s", p)) + } + return path.Join(local_prefix, p) +} + +func get_thumbnail_local_path(remote_url string) string { + u, err := url.Parse(remote_url) + if err != nil { + panic(err) + } + if u.RawQuery == "" { + return path.Base(u.Path) + } + query_params, err := url.ParseQuery(u.RawQuery) + if err != nil { + panic(err) + } + + return get_prefixed_path( + fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0]), + ) +} + type APITweet struct { ID int64 `json:"id_str,string"` ConversationID int64 `json:"conversation_id_str,string"` @@ -184,6 +359,171 @@ type APITweet struct { IsExpandable bool } +func (t APITweet) ToTweetTrove() (TweetTrove, error) { + ret := NewTweetTrove() + if t.RetweetedStatusIDStr == "" { + // Parse as a Tweet + new_tweet, err := ParseSingleTweet(t) + if err != nil { + return ret, err + } + ret.Tweets[new_tweet.ID] = new_tweet + for _, space := range new_tweet.Spaces { + ret.Spaces[space.ID] = space + } + } else { + // Parse as a Retweet + new_retweet := Retweet{} + var err error + + t.NormalizeContent() + + new_retweet.RetweetID = TweetID(t.ID) + new_retweet.TweetID = TweetID(t.RetweetedStatusID) + new_retweet.RetweetedByID = UserID(t.UserID) + new_retweet.RetweetedAt, err = TimestampFromString(t.CreatedAt) + if err != nil { + return ret, err + } + ret.Retweets[new_retweet.RetweetID] = new_retweet + } + return ret, nil +} + +// Turn an APITweet, as returned from the scraper, into a properly structured Tweet object +func ParseSingleTweet(t APITweet) (ret Tweet, err error) { + t.NormalizeContent() + + ret.ID = TweetID(t.ID) + ret.UserID = UserID(t.UserID) + ret.UserHandle = UserHandle(t.UserHandle) + ret.Text = t.FullText + ret.IsExpandable = t.IsExpandable + + // Process "posted-at" date and time + if t.TombstoneText == "" { // Skip time parsing for tombstones + ret.PostedAt, err = TimestampFromString(t.CreatedAt) + if err != nil { + if ret.ID == 0 { + return Tweet{}, fmt.Errorf("unable to parse tweet: %w", ERR_NO_TWEET) + } + return Tweet{}, fmt.Errorf("Error parsing time on tweet ID %d:\n %w", ret.ID, err) + } + } + + ret.NumLikes = t.FavoriteCount + ret.NumRetweets = t.RetweetCount + ret.NumReplies = t.ReplyCount + ret.NumQuoteTweets = t.QuoteCount + ret.InReplyToID = TweetID(t.InReplyToStatusID) + ret.QuotedTweetID = TweetID(t.QuotedStatusID) + + // Process URLs and link previews + for _, url := range t.Entities.URLs { + var url_object Url + if t.Card.ShortenedUrl == url.ShortenedUrl { + if t.Card.Name == "3691233323:audiospace" { + // This "url" is just a link to a Space. Don't process it as a Url + continue + } + url_object = ParseAPIUrlCard(t.Card) + } + url_object.Text = url.ExpandedURL + url_object.ShortText = url.ShortenedUrl + url_object.TweetID = ret.ID + + // Skip it if it's just the quoted tweet + _, id, is_ok := TryParseTweetUrl(url.ExpandedURL) + if is_ok && id == ret.QuotedTweetID { + continue + } + + ret.Urls = append(ret.Urls, url_object) + } + + // Process images + for _, media := range t.Entities.Media { + if media.Type != "photo" { + // Videos now have an entry in "Entities.Media" but they can be ignored; the useful bit is in ExtendedEntities + // So skip ones that aren't "photo" + continue + } + new_image := ParseAPIMedia(media) + new_image.TweetID = ret.ID + ret.Images = append(ret.Images, new_image) + } + + // Process hashtags + for _, hashtag := range t.Entities.Hashtags { + ret.Hashtags = append(ret.Hashtags, hashtag.Text) + } + + // Process `@` mentions and reply-mentions + for _, mention := range t.Entities.Mentions { + ret.Mentions = append(ret.Mentions, mention.UserName) + } + for _, mention := range strings.Split(t.Entities.ReplyMentions, " ") { + if mention != "" { + if mention[0] != '@' { + panic(fmt.Errorf("Unknown ReplyMention value %q:\n %w", t.Entities.ReplyMentions, EXTERNAL_API_ERROR)) + } + ret.ReplyMentions = append(ret.ReplyMentions, mention[1:]) + } + } + + // Process videos + for _, entity := range t.ExtendedEntities.Media { + if entity.Type != "video" && entity.Type != "animated_gif" { + continue + } + + new_video := ParseAPIVideo(entity) + new_video.TweetID = ret.ID + ret.Videos = append(ret.Videos, new_video) + + // Remove the thumbnail from the Images list + updated_imgs := []Image{} + for _, img := range ret.Images { + if VideoID(img.ID) != new_video.ID { + updated_imgs = append(updated_imgs, img) + } + } + ret.Images = updated_imgs + } + + // Process polls + if strings.Index(t.Card.Name, "poll") == 0 { + poll := ParseAPIPoll(t.Card) + poll.TweetID = ret.ID + ret.Polls = []Poll{poll} + } + + // Process spaces + if t.Card.Name == "3691233323:audiospace" { + space := Space{} + space.ID = SpaceID(t.Card.BindingValues.ID.StringValue) + space.ShortUrl = t.Card.ShortenedUrl + + // Indicate that this Space needs its details fetched still + space.IsDetailsFetched = false + + ret.Spaces = []Space{space} + ret.SpaceID = space.ID + } + + // Process tombstones and other metadata + ret.TombstoneType = t.TombstoneText + ret.IsStub = !(ret.TombstoneType == "") + ret.LastScrapedAt = TimestampFromUnix(0) // Caller will change this for the tweet that was actually scraped + ret.IsConversationScraped = false // Safe due to the "No Worsening" principle + + // Extra data that can help piece together tombstoned tweet info + ret.in_reply_to_user_id = UserID(t.InReplyToUserID) + ret.in_reply_to_user_handle = UserHandle(t.InReplyToScreenName) + + return +} + func (t *APITweet) NormalizeContent() { id, err := strconv.Atoi(t.QuotedStatusIDStr) if err == nil { @@ -260,6 +600,54 @@ type APIUser struct { DoesntExist bool } +// Turn an APIUser, as returned from the scraper, into a properly structured User object +func ParseSingleUser(apiUser APIUser) (ret User, err error) { + if apiUser.DoesntExist { + // User may have been deleted, or there was a typo. There's no data to parse + if apiUser.ScreenName == "" { + panic("ScreenName is empty!") + } + ret = GetUnknownUserWithHandle(UserHandle(apiUser.ScreenName)) + return + } + ret.ID = UserID(apiUser.ID) + ret.Handle = UserHandle(apiUser.ScreenName) + if apiUser.IsBanned { + // Banned users won't have any further info, so just return here + ret.IsBanned = true + return + } + ret.DisplayName = apiUser.Name + ret.Bio = apiUser.Description + ret.FollowingCount = apiUser.FriendsCount + ret.FollowersCount = apiUser.FollowersCount + ret.Location = apiUser.Location + if len(apiUser.Entities.URL.Urls) > 0 { + ret.Website = apiUser.Entities.URL.Urls[0].ExpandedURL + } + ret.JoinDate, err = TimestampFromString(apiUser.CreatedAt) + if err != nil { + err = fmt.Errorf("Error parsing time on user ID %d: %w", ret.ID, err) + return + } + ret.IsPrivate = apiUser.Protected + ret.IsVerified = apiUser.Verified + ret.ProfileImageUrl = apiUser.ProfileImageURLHTTPS + + if regexp.MustCompile(`_normal\.\w{2,4}`).MatchString(ret.ProfileImageUrl) { + ret.ProfileImageUrl = strings.ReplaceAll(ret.ProfileImageUrl, "_normal.", ".") + } + ret.BannerImageUrl = apiUser.ProfileBannerURL + + ret.ProfileImageLocalPath = ret.compute_profile_image_local_path() + ret.BannerImageLocalPath = ret.compute_banner_image_local_path() + + if len(apiUser.PinnedTweetIdsStr) > 0 { + ret.PinnedTweetID = TweetID(idstr_to_int(apiUser.PinnedTweetIdsStr[0])) + } + return +} + type APINotification struct { ID string `json:"id"` TimestampMs int64 `json:"timestampMs,string"` @@ -565,22 +953,11 @@ func (t *TweetResponse) ToTweetTrove() (TweetTrove, error) { ret := NewTweetTrove() for _, single_tweet := range t.GlobalObjects.Tweets { - if single_tweet.RetweetedStatusIDStr == "" { - new_tweet, err := ParseSingleTweet(single_tweet) - if err != nil { - return ret, err - } - ret.Tweets[new_tweet.ID] = new_tweet - for _, space := range new_tweet.Spaces { - ret.Spaces[space.ID] = space - } - } else { - new_retweet, err := ParseSingleRetweet(single_tweet) - if err != nil { - return ret, err - } - ret.Retweets[new_retweet.RetweetID] = new_retweet + trove, err := single_tweet.ToTweetTrove() + if err != nil { + return ret, err } + ret.MergeWith(trove) } for _, user := range t.GlobalObjects.Users { @@ -597,10 +974,14 @@ func (t *TweetResponse) ToTweetTrove() (TweetTrove, error) { return ret, nil } -func idstr_to_int(idstr string) int64 { - id, err := strconv.Atoi(idstr) +func idstr_to_int(s string) int64 { + return int64(int_or_panic(s)) +} + +func int_or_panic(s string) int { + result, err := strconv.Atoi(s) if err != nil { panic(err) } - return int64(id) + return result } diff --git a/pkg/scraper/api_types_dms.go b/pkg/scraper/api_types_dms.go index 2d5677c..0303de9 100644 --- a/pkg/scraper/api_types_dms.go +++ b/pkg/scraper/api_types_dms.go @@ -5,6 +5,7 @@ import ( "fmt" "html" "net/url" + "path" "strings" "github.com/google/uuid" @@ -77,38 +78,106 @@ func (m *APIDMMessage) NormalizeContent() { m.MessageData.Text = strings.TrimSpace(m.MessageData.Text) } -func (m APIDMMessage) ToTweetTrove() TweetTrove { +func (api_msg APIDMMessage) ToTweetTrove() TweetTrove { ret := NewTweetTrove() - if m.ID == 0 { + if api_msg.ID == 0 { return ret } - m.NormalizeContent() - result := ParseAPIDMMessage(m) + api_msg.NormalizeContent() + + msg := DMMessage{} + msg.ID = DMMessageID(api_msg.ID) + msg.SentAt = TimestampFromUnixMilli(int64(api_msg.Time)) + msg.DMChatRoomID = DMChatRoomID(api_msg.ConversationID) + msg.SenderID = UserID(api_msg.MessageData.SenderID) + msg.Text = api_msg.MessageData.Text + + msg.InReplyToID = DMMessageID(api_msg.MessageData.ReplyData.ID) // Will be "0" if not a reply + + msg.Reactions = make(map[UserID]DMReaction) + for _, api_reacc := range api_msg.MessageReactions { + reacc := DMReaction{} + reacc.ID = DMMessageID(api_reacc.ID) + reacc.SenderID = UserID(api_reacc.SenderID) + reacc.SentAt = TimestampFromUnixMilli(int64(api_reacc.Time)) + reacc.Emoji = api_reacc.Emoji + reacc.DMMessageID = msg.ID + msg.Reactions[reacc.SenderID] = reacc + } + if api_msg.MessageData.Attachment.Photo.ID != 0 { + new_image := ParseAPIMedia(api_msg.MessageData.Attachment.Photo) + new_image.DMMessageID = msg.ID + msg.Images = []Image{new_image} + } + if api_msg.MessageData.Attachment.Video.ID != 0 { + entity := api_msg.MessageData.Attachment.Video + if entity.Type == "video" || entity.Type == "animated_gif" { + new_video := ParseAPIVideo(entity) + new_video.DMMessageID = msg.ID + msg.Videos = append(msg.Videos, new_video) + } + } + + // Process URLs and link previews + for _, url := range api_msg.MessageData.Entities.URLs { + // Skip it if it's an embedded tweet + _, id, is_ok := TryParseTweetUrl(url.ExpandedURL) + if is_ok && id == TweetID(api_msg.MessageData.Attachment.Tweet.Status.ID) { + continue + } + // Skip it if it's an embedded image + if api_msg.MessageData.Attachment.Photo.URL == url.ShortenedUrl { + continue + } + // Skip it if it's an embedded video + if api_msg.MessageData.Attachment.Video.URL == url.ShortenedUrl { + continue + } + + var new_url Url + if api_msg.MessageData.Attachment.Card.ShortenedUrl == url.ShortenedUrl { + if api_msg.MessageData.Attachment.Card.Name == "3691233323:audiospace" { + // This "url" is just a link to a Space. Don't process it as a Url + // TODO: ...but do process it as a Space? + continue + } + new_url = ParseAPIUrlCard(api_msg.MessageData.Attachment.Card) + } + new_url.Text = url.ExpandedURL + new_url.ShortText = url.ShortenedUrl + new_url.DMMessageID = msg.ID + msg.Urls = append(msg.Urls, new_url) + } // Parse tweet attachment - if m.MessageData.Attachment.Tweet.Status.ID != 0 { - u, err := ParseSingleUser(m.MessageData.Attachment.Tweet.Status.User) + if api_msg.MessageData.Attachment.Tweet.Status.ID != 0 { + u, err := ParseSingleUser(api_msg.MessageData.Attachment.Tweet.Status.User) if err != nil { panic(err) } ret.Users[u.ID] = u - t, err := ParseSingleTweet(m.MessageData.Attachment.Tweet.Status.APITweet) + t, err := ParseSingleTweet(api_msg.MessageData.Attachment.Tweet.Status.APITweet) if err != nil { panic(err) } t.UserID = u.ID ret.Tweets[t.ID] = t - result.EmbeddedTweetID = t.ID + msg.EmbeddedTweetID = t.ID } - ret.Messages[result.ID] = result - - // TODO: parse attached images and videos + ret.Messages[msg.ID] = msg return ret } +type APIDMResponse struct { + InboxInitialState APIInbox `json:"inbox_initial_state"` + InboxTimeline APIInbox `json:"inbox_timeline"` + ConversationTimeline APIInbox `json:"conversation_timeline"` + UserEvents APIInbox `json:"user_events"` +} + type APIDMConversation struct { ConversationID string `json:"conversation_id"` Type string `json:"type"` @@ -179,13 +248,6 @@ type APIInbox struct { Conversations map[string]APIDMConversation `json:"conversations"` } -type APIDMResponse struct { - InboxInitialState APIInbox `json:"inbox_initial_state"` - InboxTimeline APIInbox `json:"inbox_timeline"` - ConversationTimeline APIInbox `json:"conversation_timeline"` - UserEvents APIInbox `json:"user_events"` -} - func (r APIInbox) ToTweetTrove(current_user_id UserID) TweetTrove { ret := NewTweetTrove() @@ -211,8 +273,8 @@ func (r APIInbox) ToTweetTrove(current_user_id UserID) TweetTrove { ret.MergeWith(entry.Message.ToTweetTrove()) } - for _, room := range r.Conversations { - result := ParseAPIDMChatRoom(room, current_user_id) + for _, api_room := range r.Conversations { + result := ParseAPIDMChatRoom(api_room, current_user_id) ret.Rooms[result.ID] = result } for _, u := range r.Users { @@ -225,6 +287,46 @@ func (r APIInbox) ToTweetTrove(current_user_id UserID) TweetTrove { return ret } +func ParseAPIDMChatRoom(api_room APIDMConversation, current_user_id UserID) DMChatRoom { + result := DMChatRoom{} + result.ID = DMChatRoomID(api_room.ConversationID) + result.Type = api_room.Type + result.LastMessagedAt = TimestampFromUnixMilli(int64(api_room.SortTimestamp)) + result.IsNSFW = api_room.NSFW + + if result.Type == "GROUP_DM" { + result.CreatedAt = TimestampFromUnixMilli(int64(api_room.CreateTime)) + result.CreatedByUserID = UserID(api_room.CreatedByUserID) + result.Name = api_room.Name + result.AvatarImageRemoteURL = api_room.AvatarImage + tmp_url, err := url.Parse(result.AvatarImageRemoteURL) + if err != nil { + panic(err) + } + result.AvatarImageLocalPath = fmt.Sprintf("%s_avatar_%s.%s", result.ID, path.Base(tmp_url.Path), tmp_url.Query().Get("format")) + } + + result.Participants = make(map[UserID]DMChatParticipant) + for _, api_participant := range api_room.Participants { + participant := DMChatParticipant{} + participant.UserID = UserID(api_participant.UserID) + participant.DMChatRoomID = result.ID + participant.LastReadEventID = DMMessageID(api_participant.LastReadEventID) + + // Process chat settings if this is the logged-in user + if participant.UserID == current_user_id { + participant.IsNotificationsDisabled = api_room.NotificationsDisabled + participant.IsReadOnly = api_room.ReadOnly + participant.IsTrusted = api_room.Trusted + participant.IsMuted = api_room.Muted + participant.Status = api_room.Status + participant.IsChatSettingsValid = true + } + result.Participants[participant.UserID] = participant + } + return result +} + func (api *API) GetDMInbox() (APIInbox, error) { url, err := url.Parse("https://twitter.com/i/api/1.1/dm/inbox_initial_state.json") if err != nil { @@ -284,6 +386,30 @@ func (api *API) GetDMInbox() (APIInbox, error) { result.InboxInitialState.Status = result.InboxInitialState.InboxTimelines.Trusted.Status return result.InboxInitialState, err } +func (api *API) GetInbox(how_many int) (TweetTrove, string, error) { + if !api.IsAuthenticated { + return TweetTrove{}, "", ErrLoginRequired + } + dm_response, err := api.GetDMInbox() + if err != nil { + panic(err) + } + + trove := dm_response.ToTweetTrove(api.UserID) + cursor := dm_response.Cursor + next_cursor_id := dm_response.InboxTimelines.Trusted.MinEntryID + for len(trove.Rooms) < how_many && dm_response.Status != "AT_END" { + dm_response, err = api.GetInboxTrusted(next_cursor_id) + if err != nil { + panic(err) + } + next_trove := dm_response.ToTweetTrove(api.UserID) + next_cursor_id = dm_response.MinEntryID + trove.MergeWith(next_trove) + } + + return trove, cursor, nil +} func (api *API) GetInboxTrusted(oldest_id int) (APIInbox, error) { url, err := url.Parse("https://twitter.com/i/api/1.1/dm/inbox_timeline/trusted.json") @@ -345,62 +471,87 @@ func (api *API) GetInboxTrusted(oldest_id int) (APIInbox, error) { return result.InboxTimeline, err } -func (api *API) GetDMConversation(id DMChatRoomID, max_id DMMessageID) (APIInbox, error) { - url, err := url.Parse("https://twitter.com/i/api/1.1/dm/conversation/" + string(id) + ".json") +func (api *API) GetConversation(room_id DMChatRoomID, max_id DMMessageID, how_many int) (TweetTrove, error) { + if !api.IsAuthenticated { + return TweetTrove{}, ErrLoginRequired + } + + fetch := func(max_id DMMessageID) (APIInbox, error) { + url, err := url.Parse("https://twitter.com/i/api/1.1/dm/conversation/" + string(room_id) + ".json") + if err != nil { + panic(err) + } + query := url.Query() + query.Add("max_id", fmt.Sprint(max_id)) + query.Add("context", "FETCH_DM_CONVERSATION_HISTORY") + query.Add("include_profile_interstitial_type", "1") + query.Add("include_blocking", "1") + query.Add("include_blocked_by", "1") + query.Add("include_followed_by", "1") + query.Add("include_want_retweets", "1") + query.Add("include_mute_edge", "1") + query.Add("include_can_dm", "1") + query.Add("include_can_media_tag", "1") + query.Add("include_ext_has_nft_avatar", "1") + query.Add("include_ext_is_blue_verified", "1") + query.Add("include_ext_verified_type", "1") + query.Add("include_ext_profile_image_shape", "1") + query.Add("skip_status", "1") + query.Add("dm_secret_conversations_enabled", "false") + query.Add("krs_registration_enabled", "true") + query.Add("cards_platform", "Web-12") + query.Add("include_cards", "1") + query.Add("include_ext_alt_text", "true") + query.Add("include_ext_limited_action_results", "true") + query.Add("include_quote_count", "true") + query.Add("include_reply_count", "1") + query.Add("tweet_mode", "extended") + query.Add("include_ext_views", "true") + query.Add("dm_users", "false") + query.Add("include_groups", "true") + query.Add("include_inbox_timelines", "true") + query.Add("include_ext_media_color", "true") + query.Add("supports_reactions", "true") + query.Add("include_conversation_info", "true") + query.Add("ext", strings.Join([]string{ + "mediaColor", + "altText", + "mediaStats", + "highlightedLabel", + "hasNftAvatar", + "voiceInfo", + "birdwatchPivot", + "enrichments", + "superFollowMetadata", + "unmentionInfo", + "editControl", + "vibe", + }, ",")) + url.RawQuery = query.Encode() + + var result APIDMResponse + err = api.do_http(url.String(), "", &result) + return result.ConversationTimeline, err + } + + dm_response, err := fetch(max_id) if err != nil { panic(err) } - query := url.Query() - query.Add("max_id", fmt.Sprint(max_id)) - query.Add("context", "FETCH_DM_CONVERSATION_HISTORY") - query.Add("include_profile_interstitial_type", "1") - query.Add("include_blocking", "1") - query.Add("include_blocked_by", "1") - query.Add("include_followed_by", "1") - query.Add("include_want_retweets", "1") - query.Add("include_mute_edge", "1") - query.Add("include_can_dm", "1") - query.Add("include_can_media_tag", "1") - query.Add("include_ext_has_nft_avatar", "1") - query.Add("include_ext_is_blue_verified", "1") - query.Add("include_ext_verified_type", "1") - query.Add("include_ext_profile_image_shape", "1") - query.Add("skip_status", "1") - query.Add("dm_secret_conversations_enabled", "false") - query.Add("krs_registration_enabled", "true") - query.Add("cards_platform", "Web-12") - query.Add("include_cards", "1") - query.Add("include_ext_alt_text", "true") - query.Add("include_ext_limited_action_results", "true") - query.Add("include_quote_count", "true") - query.Add("include_reply_count", "1") - query.Add("tweet_mode", "extended") - query.Add("include_ext_views", "true") - query.Add("dm_users", "false") - query.Add("include_groups", "true") - query.Add("include_inbox_timelines", "true") - query.Add("include_ext_media_color", "true") - query.Add("supports_reactions", "true") - query.Add("include_conversation_info", "true") - query.Add("ext", strings.Join([]string{ - "mediaColor", - "altText", - "mediaStats", - "highlightedLabel", - "hasNftAvatar", - "voiceInfo", - "birdwatchPivot", - "enrichments", - "superFollowMetadata", - "unmentionInfo", - "editControl", - "vibe", - }, ",")) - url.RawQuery = query.Encode() - var result APIDMResponse - err = api.do_http(url.String(), "", &result) - return result.ConversationTimeline, err + trove := dm_response.ToTweetTrove(api.UserID) + oldest := trove.GetOldestMessage(room_id) + for len(trove.Messages) < how_many && dm_response.Status != "AT_END" { + dm_response, err = fetch(oldest) + if err != nil { + panic(err) + } + next_trove := dm_response.ToTweetTrove(api.UserID) + oldest = next_trove.GetOldestMessage(room_id) + trove.MergeWith(next_trove) + } + + return trove, nil } // Returns a TweetTrove and the cursor for the next update, or an error @@ -459,6 +610,9 @@ func (api *API) PollInboxUpdates(cursor string) (TweetTrove, string, error) { return result.UserEvents.ToTweetTrove(api.UserID), result.UserEvents.Cursor, nil } +// Writes +// ------ + func (api *API) SendDMMessage(room_id DMChatRoomID, text string, in_reply_to_id DMMessageID) (TweetTrove, error) { if !api.IsAuthenticated { return TweetTrove{}, ErrLoginRequired diff --git a/pkg/scraper/api_types_dms_test.go b/pkg/scraper/api_types_dms_test.go index 637af86..22dbb85 100644 --- a/pkg/scraper/api_types_dms_test.go +++ b/pkg/scraper/api_types_dms_test.go @@ -21,7 +21,9 @@ func TestParseAPIDMMessage(t *testing.T) { err = json.Unmarshal(data, &api_message) require.NoError(t, err) - message := ParseAPIDMMessage(api_message) + trove := api_message.ToTweetTrove() + message, is_ok := trove.Messages[DMMessageID(api_message.ID)] + require.True(t, is_ok) assert.Equal(message.ID, DMMessageID(1663623203644751885)) assert.Equal(message.SentAt, TimestampFromUnixMilli(1685473655064)) assert.Equal(message.DMChatRoomID, DMChatRoomID("1458284524761075714-1488963321701171204")) @@ -41,7 +43,9 @@ func TestParseAPIDMMessageWithReaction(t *testing.T) { err = json.Unmarshal(data, &api_message) require.NoError(t, err) - message := ParseAPIDMMessage(api_message) + trove := api_message.ToTweetTrove() + message, is_ok := trove.Messages[DMMessageID(api_message.ID)] + require.True(t, is_ok) assert.Equal(message.ID, DMMessageID(1663623062195957773)) require.Len(t, message.Reactions, 1) diff --git a/pkg/scraper/api_types_v2.go b/pkg/scraper/api_types_v2.go index f6f32d3..99223bd 100644 --- a/pkg/scraper/api_types_v2.go +++ b/pkg/scraper/api_types_v2.go @@ -1390,7 +1390,19 @@ func (api API) GetUser(handle UserHandle) (User, error) { return ParseSingleUser(apiUser) } +// Calls API#GetUserByID and returns the parsed result +func GetUserByID(u_id UserID) (User, error) { + session, err := NewGuestSession() // This endpoint works better if you're not logged in + if err != nil { + return User{}, err + } + return session.GetUserByID(u_id) +} + func (api API) GetUserByID(u_id UserID) (User, error) { + if u_id == UserID(0) { + panic("No Users with ID 0") + } url, err := url.Parse(GraphqlURL{ BaseUrl: "https://x.com/i/api/graphql/Qw77dDjp9xCpUY-AXwt-yQ/UserByRestId", Variables: GraphqlVariables{ diff --git a/pkg/scraper/constants.go b/pkg/scraper/constants.go deleted file mode 100644 index f8842e7..0000000 --- a/pkg/scraper/constants.go +++ /dev/null @@ -1,6 +0,0 @@ -package scraper - -// Tokens -// ------ - -const BEARER_TOKEN string = "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" diff --git a/pkg/scraper/conversation.go b/pkg/scraper/conversation.go deleted file mode 100644 index 4d705d1..0000000 --- a/pkg/scraper/conversation.go +++ /dev/null @@ -1,18 +0,0 @@ -package scraper - -type ConversationID string - -type Conversation struct { - ID ConversationID - Type string - SortEventID int - SortTimestamp int - Participants []User - Nsfw bool - NotificationsDisabled bool - LastReadEventId int - ReadOnly bool - Trusted bool - LowQuality bool - Muted bool -} diff --git a/pkg/scraper/dm_chat_room.go b/pkg/scraper/dm_chat_room.go index a66bfec..5a6f552 100644 --- a/pkg/scraper/dm_chat_room.go +++ b/pkg/scraper/dm_chat_room.go @@ -1,11 +1,5 @@ package scraper -import ( - "fmt" - "net/url" - "path" -) - type DMChatRoomID string // A participant in a chat room. @@ -45,6 +39,8 @@ type DMChatRoom struct { Participants map[UserID]DMChatParticipant } +// TODO: view-layer +// - view helpers should go in a view layer func (r DMChatRoom) GetParticipantIDs() []UserID { ret := []UserID{} for user_id := range r.Participants { @@ -52,43 +48,3 @@ func (r DMChatRoom) GetParticipantIDs() []UserID { } return ret } - -func ParseAPIDMChatRoom(api_room APIDMConversation, current_user_id UserID) DMChatRoom { - ret := DMChatRoom{} - ret.ID = DMChatRoomID(api_room.ConversationID) - ret.Type = api_room.Type - ret.LastMessagedAt = TimestampFromUnixMilli(int64(api_room.SortTimestamp)) - ret.IsNSFW = api_room.NSFW - - if ret.Type == "GROUP_DM" { - ret.CreatedAt = TimestampFromUnixMilli(int64(api_room.CreateTime)) - ret.CreatedByUserID = UserID(api_room.CreatedByUserID) - ret.Name = api_room.Name - ret.AvatarImageRemoteURL = api_room.AvatarImage - tmp_url, err := url.Parse(ret.AvatarImageRemoteURL) - if err != nil { - panic(err) - } - ret.AvatarImageLocalPath = fmt.Sprintf("%s_avatar_%s.%s", ret.ID, path.Base(tmp_url.Path), tmp_url.Query().Get("format")) - } - - ret.Participants = make(map[UserID]DMChatParticipant) - for _, api_participant := range api_room.Participants { - participant := DMChatParticipant{} - participant.UserID = UserID(api_participant.UserID) - participant.DMChatRoomID = ret.ID - participant.LastReadEventID = DMMessageID(api_participant.LastReadEventID) - - // Process chat settings if this is the logged-in user - if participant.UserID == current_user_id { - participant.IsNotificationsDisabled = api_room.NotificationsDisabled - participant.IsReadOnly = api_room.ReadOnly - participant.IsTrusted = api_room.Trusted - participant.IsMuted = api_room.Muted - participant.Status = api_room.Status - participant.IsChatSettingsValid = true - } - ret.Participants[participant.UserID] = participant - } - return ret -} diff --git a/pkg/scraper/dm_message.go b/pkg/scraper/dm_message.go index 23ab2cf..cd566f8 100644 --- a/pkg/scraper/dm_message.go +++ b/pkg/scraper/dm_message.go @@ -10,15 +10,6 @@ type DMReaction struct { Emoji string `db:"emoji"` } -func ParseAPIDMReaction(reacc APIDMReaction) DMReaction { - ret := DMReaction{} - ret.ID = DMMessageID(reacc.ID) - ret.SenderID = UserID(reacc.SenderID) - ret.SentAt = TimestampFromUnixMilli(int64(reacc.Time)) - ret.Emoji = reacc.Emoji - return ret -} - type DMMessage struct { ID DMMessageID `db:"id"` DMChatRoomID DMChatRoomID `db:"chat_room_id"` @@ -33,67 +24,6 @@ type DMMessage struct { Images []Image Videos []Video Urls []Url -} - -func ParseAPIDMMessage(message APIDMMessage) DMMessage { - ret := DMMessage{} - ret.ID = DMMessageID(message.ID) - ret.SentAt = TimestampFromUnixMilli(int64(message.Time)) - ret.DMChatRoomID = DMChatRoomID(message.ConversationID) - ret.SenderID = UserID(message.MessageData.SenderID) - ret.Text = message.MessageData.Text - - ret.InReplyToID = DMMessageID(message.MessageData.ReplyData.ID) // Will be "0" if not a reply - - ret.Reactions = make(map[UserID]DMReaction) - for _, api_reacc := range message.MessageReactions { - reacc := ParseAPIDMReaction(api_reacc) - reacc.DMMessageID = ret.ID - ret.Reactions[reacc.SenderID] = reacc - } - if message.MessageData.Attachment.Photo.ID != 0 { - new_image := ParseAPIMedia(message.MessageData.Attachment.Photo) - new_image.DMMessageID = ret.ID - ret.Images = []Image{new_image} - } - if message.MessageData.Attachment.Video.ID != 0 { - entity := message.MessageData.Attachment.Video - if entity.Type == "video" || entity.Type == "animated_gif" { - new_video := ParseAPIVideo(entity) - new_video.DMMessageID = ret.ID - ret.Videos = append(ret.Videos, new_video) - } - } - - // Process URLs and link previews - for _, url := range message.MessageData.Entities.URLs { - // Skip it if it's an embedded tweet - _, id, is_ok := TryParseTweetUrl(url.ExpandedURL) - if is_ok && id == TweetID(message.MessageData.Attachment.Tweet.Status.ID) { - continue - } - // Skip it if it's an embedded image - if message.MessageData.Attachment.Photo.URL == url.ShortenedUrl { - continue - } - // Skip it if it's an embedded video - if message.MessageData.Attachment.Video.URL == url.ShortenedUrl { - continue - } - - var new_url Url - if message.MessageData.Attachment.Card.ShortenedUrl == url.ShortenedUrl { - if message.MessageData.Attachment.Card.Name == "3691233323:audiospace" { - // This "url" is just a link to a Space. Don't process it as a Url - continue - } - new_url = ParseAPIUrlCard(message.MessageData.Attachment.Card) - } - new_url.Text = url.ExpandedURL - new_url.ShortText = url.ShortenedUrl - new_url.DMMessageID = ret.ID - ret.Urls = append(ret.Urls, new_url) - } - - return ret + + LastReadEventUserIDs []UserID // Used for rendering } diff --git a/pkg/scraper/dm_trove.go b/pkg/scraper/dm_trove.go deleted file mode 100644 index 5aaa8e9..0000000 --- a/pkg/scraper/dm_trove.go +++ /dev/null @@ -1,63 +0,0 @@ -package scraper - -func (t TweetTrove) GetOldestMessage(id DMChatRoomID) DMMessageID { - oldest := DMMessageID(^uint(0) >> 1) // Max integer - for _, m := range t.Messages { - if m.ID < oldest && m.DMChatRoomID == id { - oldest = m.ID - } - } - return oldest -} - -// TODO: Why are these all here? => - -// Returns a TweetTrove and the cursor for the next update -func (api *API) GetInbox(how_many int) (TweetTrove, string, error) { - if !api.IsAuthenticated { - return TweetTrove{}, "", ErrLoginRequired - } - dm_response, err := api.GetDMInbox() - if err != nil { - panic(err) - } - - trove := dm_response.ToTweetTrove(api.UserID) - cursor := dm_response.Cursor - next_cursor_id := dm_response.InboxTimelines.Trusted.MinEntryID - for len(trove.Rooms) < how_many && dm_response.Status != "AT_END" { - dm_response, err = api.GetInboxTrusted(next_cursor_id) - if err != nil { - panic(err) - } - next_trove := dm_response.ToTweetTrove(api.UserID) - next_cursor_id = dm_response.MinEntryID - trove.MergeWith(next_trove) - } - - return trove, cursor, nil -} - -func (api *API) GetConversation(id DMChatRoomID, max_id DMMessageID, how_many int) (TweetTrove, error) { - if !api.IsAuthenticated { - return TweetTrove{}, ErrLoginRequired - } - dm_response, err := api.GetDMConversation(id, max_id) - if err != nil { - panic(err) - } - - trove := dm_response.ToTweetTrove(api.UserID) - oldest := trove.GetOldestMessage(id) - for len(trove.Messages) < how_many && dm_response.Status != "AT_END" { - dm_response, err = api.GetDMConversation(id, oldest) - if err != nil { - panic(err) - } - next_trove := dm_response.ToTweetTrove(api.UserID) - oldest = next_trove.GetOldestMessage(id) - trove.MergeWith(next_trove) - } - - return trove, nil -} diff --git a/pkg/scraper/guest_token.go b/pkg/scraper/guest_token.go index 2a7c243..d8b8872 100644 --- a/pkg/scraper/guest_token.go +++ b/pkg/scraper/guest_token.go @@ -11,6 +11,8 @@ import ( "time" ) +const BEARER_TOKEN string = "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" + type GuestTokenResponse struct { Token string `json:"guest_token"` RefreshedAt time.Time diff --git a/pkg/scraper/image.go b/pkg/scraper/image.go index e6ededc..2d14b0f 100644 --- a/pkg/scraper/image.go +++ b/pkg/scraper/image.go @@ -1,9 +1,5 @@ package scraper -import ( - "path" -) - type ImageID int64 type Image struct { @@ -16,16 +12,3 @@ type Image struct { LocalFilename string `db:"local_filename"` IsDownloaded bool `db:"is_downloaded"` } - -func ParseAPIMedia(apiMedia APIMedia) Image { - local_filename := get_prefixed_path(path.Base(apiMedia.MediaURLHttps)) - - return Image{ - ID: ImageID(apiMedia.ID), - RemoteURL: apiMedia.MediaURLHttps, - Width: apiMedia.OriginalInfo.Width, - Height: apiMedia.OriginalInfo.Height, - LocalFilename: local_filename, - IsDownloaded: false, - } -} diff --git a/pkg/scraper/link_expander.go b/pkg/scraper/link_expander.go index 74d175a..11943d7 100644 --- a/pkg/scraper/link_expander.go +++ b/pkg/scraper/link_expander.go @@ -3,6 +3,8 @@ package scraper import ( "fmt" "net/http" + "net/url" + "regexp" "time" ) @@ -32,3 +34,43 @@ func ExpandShortUrl(short_url string) string { } return long_url } + +// Given an URL, try to parse it as a tweet url. +// The bool is an `is_ok` value; true if the parse was successful, false if it didn't match +func TryParseTweetUrl(s string) (UserHandle, TweetID, bool) { + parsed_url, err := url.Parse(s) + if err != nil { + return UserHandle(""), TweetID(0), false + } + + if parsed_url.Host != "twitter.com" && parsed_url.Host != "mobile.twitter.com" && parsed_url.Host != "x.com" { + return UserHandle(""), TweetID(0), false + } + + r := regexp.MustCompile(`^/(\w+)/status/(\d+)$`) + matches := r.FindStringSubmatch(parsed_url.Path) + if matches == nil { + return UserHandle(""), TweetID(0), false + } + if len(matches) != 3 { // matches[0] is the full string + panic(matches) + } + return UserHandle(matches[1]), TweetID(int_or_panic(matches[2])), true +} + +/** + * Given a tweet URL, return the corresponding user handle. + * If tweet url is not valid, return an error. + */ +func ParseHandleFromTweetUrl(tweet_url string) (UserHandle, error) { + short_url_regex := regexp.MustCompile(`^https://t.co/\w{5,20}$`) + if short_url_regex.MatchString(tweet_url) { + tweet_url = ExpandShortUrl(tweet_url) + } + + ret, _, is_ok := TryParseTweetUrl(tweet_url) + if !is_ok { + return "", fmt.Errorf("Invalid tweet url: %s", tweet_url) + } + return ret, nil +} diff --git a/pkg/scraper/poll.go b/pkg/scraper/poll.go index 12626c7..3ae0611 100644 --- a/pkg/scraper/poll.go +++ b/pkg/scraper/poll.go @@ -1,9 +1,6 @@ package scraper import ( - "net/url" - "strconv" - "strings" "time" ) @@ -29,6 +26,9 @@ type Poll struct { LastUpdatedAt Timestamp `db:"last_scraped_at"` } +// TODO: view-layer +// - view helpers should go in a view layer + func (p Poll) TotalVotes() int { return p.Choice1_Votes + p.Choice2_Votes + p.Choice3_Votes + p.Choice4_Votes } @@ -48,56 +48,3 @@ func (p Poll) IsWinner(votes int) bool { } return votes >= p.Choice1_Votes && votes >= p.Choice2_Votes && votes >= p.Choice3_Votes && votes >= p.Choice4_Votes } - -func ParseAPIPoll(apiCard APICard) Poll { - card_url, err := url.Parse(apiCard.ShortenedUrl) - if err != nil { - panic(err) - } - id := int_or_panic(card_url.Hostname()) - - ret := Poll{} - ret.ID = PollID(id) - ret.NumChoices = parse_num_choices(apiCard.Name) - ret.VotingDuration = int_or_panic(apiCard.BindingValues.DurationMinutes.StringValue) * 60 - ret.VotingEndsAt, err = TimestampFromString(apiCard.BindingValues.EndDatetimeUTC.StringValue) - if err != nil { - panic(err) - } - ret.LastUpdatedAt, err = TimestampFromString(apiCard.BindingValues.LastUpdatedAt.StringValue) - if err != nil { - panic(err) - } - - ret.Choice1 = apiCard.BindingValues.Choice1.StringValue - ret.Choice1_Votes = int_or_panic(apiCard.BindingValues.Choice1_Count.StringValue) - ret.Choice2 = apiCard.BindingValues.Choice2.StringValue - ret.Choice2_Votes = int_or_panic(apiCard.BindingValues.Choice2_Count.StringValue) - - if ret.NumChoices > 2 { - ret.Choice3 = apiCard.BindingValues.Choice3.StringValue - ret.Choice3_Votes = int_or_panic(apiCard.BindingValues.Choice3_Count.StringValue) - } - if ret.NumChoices > 3 { - ret.Choice4 = apiCard.BindingValues.Choice4.StringValue - ret.Choice4_Votes = int_or_panic(apiCard.BindingValues.Choice4_Count.StringValue) - } - - return ret -} - -func parse_num_choices(card_name string) int { - if strings.Index(card_name, "poll") != 0 || strings.Index(card_name, "choice") != 5 { - panic("Not valid card name: " + card_name) - } - - return int_or_panic(card_name[4:5]) -} - -func int_or_panic(s string) int { - result, err := strconv.Atoi(s) - if err != nil { - panic(err) - } - return result -} diff --git a/pkg/scraper/retweet.go b/pkg/scraper/retweet.go index 4e1e1da..ce94c16 100644 --- a/pkg/scraper/retweet.go +++ b/pkg/scraper/retweet.go @@ -8,16 +8,3 @@ type Retweet struct { RetweetedBy *User RetweetedAt Timestamp `db:"retweeted_at"` } - -func ParseSingleRetweet(apiTweet APITweet) (ret Retweet, err error) { - apiTweet.NormalizeContent() - - ret.RetweetID = TweetID(apiTweet.ID) - ret.TweetID = TweetID(apiTweet.RetweetedStatusID) - ret.RetweetedByID = UserID(apiTweet.UserID) - ret.RetweetedAt, err = TimestampFromString(apiTweet.CreatedAt) - if err != nil { - panic(err) - } - return -} diff --git a/pkg/scraper/retweet_test.go b/pkg/scraper/retweet_test.go index baf9501..f58f6c5 100644 --- a/pkg/scraper/retweet_test.go +++ b/pkg/scraper/retweet_test.go @@ -13,16 +13,23 @@ import ( func TestParseSingleRetweet(t *testing.T) { assert := assert.New(t) + require := require.New(t) data, err := os.ReadFile("test_responses/tweet_that_is_a_retweet.json") if err != nil { panic(err) } var api_tweet APITweet err = json.Unmarshal(data, &api_tweet) - require.NoError(t, err) + require.NoError(err) - retweet, err := ParseSingleRetweet(api_tweet) - require.NoError(t, err) + trove, err := api_tweet.ToTweetTrove() + require.NoError(err) + + require.Len(trove.Tweets, 0) + require.Len(trove.Retweets, 1) + + retweet, is_ok := trove.Retweets[TweetID(1404270043018448896)] + require.True(is_ok) assert.Equal(TweetID(1404270043018448896), retweet.RetweetID) assert.Equal(TweetID(1404269989646028804), retweet.TweetID) diff --git a/pkg/scraper/space.go b/pkg/scraper/space.go index 6392e73..c84e8dd 100644 --- a/pkg/scraper/space.go +++ b/pkg/scraper/space.go @@ -26,6 +26,9 @@ type Space struct { IsDetailsFetched bool `db:"is_details_fetched"` } +// TODO: view-layer +// - view helpers should go in a view layer + func (space Space) FormatDuration() string { duration := space.EndedAt.Time.Sub(space.StartedAt.Time) h := int(duration.Hours()) @@ -37,14 +40,3 @@ func (space Space) FormatDuration() string { } return fmt.Sprintf("%dm%02ds", m, s) } - -func ParseAPISpace(apiCard APICard) Space { - ret := Space{} - ret.ID = SpaceID(apiCard.BindingValues.ID.StringValue) - ret.ShortUrl = apiCard.ShortenedUrl - - // Indicate that this Space needs its details fetched still - ret.IsDetailsFetched = false - - return ret -} diff --git a/pkg/scraper/space_test.go b/pkg/scraper/space_test.go index 8f51e27..7d21a81 100644 --- a/pkg/scraper/space_test.go +++ b/pkg/scraper/space_test.go @@ -1,31 +1,13 @@ package scraper_test import ( - "encoding/json" - "os" "testing" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" . "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper" ) -func TestParseSpace(t *testing.T) { - assert := assert.New(t) - data, err := os.ReadFile("test_responses/tweet_content/space.json") - if err != nil { - panic(err) - } - var apiCard APICard - err = json.Unmarshal(data, &apiCard) - require.NoError(t, err) - - space := ParseAPISpace(apiCard) - assert.Equal(SpaceID("1YpKkZVyQjoxj"), space.ID) - assert.Equal("https://t.co/WBPAHNF8Om", space.ShortUrl) -} - func TestFormatSpaceDuration(t *testing.T) { assert := assert.New(t) s := Space{ diff --git a/pkg/scraper/test_responses/tweet_content/space.json b/pkg/scraper/test_responses/tweet_content/space.json deleted file mode 100644 index 525b8a1..0000000 --- a/pkg/scraper/test_responses/tweet_content/space.json +++ /dev/null @@ -1 +0,0 @@ -{"name":"3691233323:audiospace","url":"https://t.co/WBPAHNF8Om","card_type_url":"http://card-type-url-is-deprecated.invalid","binding_values":{"id":{"type":"STRING","string_value":"1YpKkZVyQjoxj"},"narrow_cast_space_type":{"type":"STRING","string_value":"0"},"card_url":{"type":"STRING","string_value":"https://t.co/WBPAHNF8Om","scribe_key":"card_url"}},"card_platform":{"platform":{"device":{"name":"Swift","version":"12"},"audience":{"name":"production"}}}} diff --git a/pkg/scraper/tweet.go b/pkg/scraper/tweet.go index 81f99c9..c987236 100644 --- a/pkg/scraper/tweet.go +++ b/pkg/scraper/tweet.go @@ -5,9 +5,6 @@ import ( "errors" "fmt" "strings" - "time" - - "gitlab.com/offline-twitter/twitter_offline_engine/pkg/terminal_utils" ) var ERR_NO_TWEET = errors.New("Empty tweet") @@ -77,172 +74,6 @@ type Tweet struct { IsConversationScraped bool `db:"is_conversation_scraped"` LastScrapedAt Timestamp `db:"last_scraped_at"` } - -func (t Tweet) String() string { - var author string - if t.User != nil { - author = fmt.Sprintf("%s\n@%s", t.User.DisplayName, t.User.Handle) - } else { - author = "@???" - } - - ret := fmt.Sprintf( - `%s -%s -%s -Replies: %d RT: %d QT: %d Likes: %d -`, - author, - terminal_utils.FormatDate(t.PostedAt.Time), - terminal_utils.WrapText(t.Text, 60), - t.NumReplies, - t.NumRetweets, - t.NumQuoteTweets, - t.NumLikes, - ) - - if len(t.Images) > 0 { - ret += fmt.Sprintf(terminal_utils.COLOR_GREEN+"images: %d\n"+terminal_utils.COLOR_RESET, len(t.Images)) - } - if len(t.Urls) > 0 { - ret += "urls: [\n" - for _, url := range t.Urls { - ret += " " + url.Text + "\n" - } - ret += "]" - } - - return ret -} - -// Turn an APITweet, as returned from the scraper, into a properly structured Tweet object -func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) { - apiTweet.NormalizeContent() - - ret.ID = TweetID(apiTweet.ID) - ret.UserID = UserID(apiTweet.UserID) - ret.UserHandle = UserHandle(apiTweet.UserHandle) - ret.Text = apiTweet.FullText - ret.IsExpandable = apiTweet.IsExpandable - - // Process "posted-at" date and time - if apiTweet.TombstoneText == "" { // Skip time parsing for tombstones - ret.PostedAt, err = TimestampFromString(apiTweet.CreatedAt) - if err != nil { - if ret.ID == 0 { - return Tweet{}, fmt.Errorf("unable to parse tweet: %w", ERR_NO_TWEET) - } - return Tweet{}, fmt.Errorf("Error parsing time on tweet ID %d:\n %w", ret.ID, err) - } - } - - ret.NumLikes = apiTweet.FavoriteCount - ret.NumRetweets = apiTweet.RetweetCount - ret.NumReplies = apiTweet.ReplyCount - ret.NumQuoteTweets = apiTweet.QuoteCount - ret.InReplyToID = TweetID(apiTweet.InReplyToStatusID) - ret.QuotedTweetID = TweetID(apiTweet.QuotedStatusID) - - // Process URLs and link previews - for _, url := range apiTweet.Entities.URLs { - var url_object Url - if apiTweet.Card.ShortenedUrl == url.ShortenedUrl { - if apiTweet.Card.Name == "3691233323:audiospace" { - // This "url" is just a link to a Space. Don't process it as a Url - continue - } - url_object = ParseAPIUrlCard(apiTweet.Card) - } - url_object.Text = url.ExpandedURL - url_object.ShortText = url.ShortenedUrl - url_object.TweetID = ret.ID - - // Skip it if it's just the quoted tweet - _, id, is_ok := TryParseTweetUrl(url.ExpandedURL) - if is_ok && id == ret.QuotedTweetID { - continue - } - - ret.Urls = append(ret.Urls, url_object) - } - - // Process images - for _, media := range apiTweet.Entities.Media { - if media.Type != "photo" { - // Videos now have an entry in "Entities.Media" but they can be ignored; the useful bit is in ExtendedEntities - // So skip ones that aren't "photo" - continue - } - new_image := ParseAPIMedia(media) - new_image.TweetID = ret.ID - ret.Images = append(ret.Images, new_image) - } - - // Process hashtags - for _, hashtag := range apiTweet.Entities.Hashtags { - ret.Hashtags = append(ret.Hashtags, hashtag.Text) - } - - // Process `@` mentions and reply-mentions - for _, mention := range apiTweet.Entities.Mentions { - ret.Mentions = append(ret.Mentions, mention.UserName) - } - for _, mention := range strings.Split(apiTweet.Entities.ReplyMentions, " ") { - if mention != "" { - if mention[0] != '@' { - panic(fmt.Errorf("Unknown ReplyMention value %q:\n %w", apiTweet.Entities.ReplyMentions, EXTERNAL_API_ERROR)) - } - ret.ReplyMentions = append(ret.ReplyMentions, mention[1:]) - } - } - - // Process videos - for _, entity := range apiTweet.ExtendedEntities.Media { - if entity.Type != "video" && entity.Type != "animated_gif" { - continue - } - - new_video := ParseAPIVideo(entity) - new_video.TweetID = ret.ID - ret.Videos = append(ret.Videos, new_video) - - // Remove the thumbnail from the Images list - updated_imgs := []Image{} - for _, img := range ret.Images { - if VideoID(img.ID) != new_video.ID { - updated_imgs = append(updated_imgs, img) - } - } - ret.Images = updated_imgs - } - - // Process polls - if strings.Index(apiTweet.Card.Name, "poll") == 0 { - poll := ParseAPIPoll(apiTweet.Card) - poll.TweetID = ret.ID - ret.Polls = []Poll{poll} - } - - // Process spaces - if apiTweet.Card.Name == "3691233323:audiospace" { - space := ParseAPISpace(apiTweet.Card) - ret.Spaces = []Space{space} - ret.SpaceID = space.ID - } - - // Process tombstones and other metadata - ret.TombstoneType = apiTweet.TombstoneText - ret.IsStub = !(ret.TombstoneType == "") - ret.LastScrapedAt = TimestampFromUnix(0) // Caller will change this for the tweet that was actually scraped - ret.IsConversationScraped = false // Safe due to the "No Worsening" principle - - // Extra data that can help piece together tombstoned tweet info - ret.in_reply_to_user_id = UserID(apiTweet.InReplyToUserID) - ret.in_reply_to_user_handle = UserHandle(apiTweet.InReplyToScreenName) - - return -} - // Get a single tweet with no replies from the API. // // args: diff --git a/pkg/scraper/tweet_trove.go b/pkg/scraper/tweet_trove.go index 6ee0e0b..5433251 100644 --- a/pkg/scraper/tweet_trove.go +++ b/pkg/scraper/tweet_trove.go @@ -195,3 +195,13 @@ func (trove *TweetTrove) PostProcess(api *API) error { } return nil } + +func (t TweetTrove) GetOldestMessage(id DMChatRoomID) DMMessageID { + oldest := DMMessageID(^uint(0) >> 1) // Max integer + for _, m := range t.Messages { + if m.ID < oldest && m.DMChatRoomID == id { + oldest = m.ID + } + } + return oldest +} diff --git a/pkg/scraper/url.go b/pkg/scraper/url.go index e8c2dde..f4f1ae2 100644 --- a/pkg/scraper/url.go +++ b/pkg/scraper/url.go @@ -1,11 +1,7 @@ package scraper import ( - "fmt" - "log" "net/url" - "path" - "regexp" ) type Url struct { @@ -28,6 +24,9 @@ type Url struct { IsContentDownloaded bool `db:"is_content_downloaded"` } +// TODO: view-layer +// - view helpers should go in a view layer + func (u Url) GetDomain() string { if u.Domain != "" { return u.Domain @@ -38,106 +37,3 @@ func (u Url) GetDomain() string { } return urlstruct.Host } - -func ParseAPIUrlCard(apiCard APICard) Url { - values := apiCard.BindingValues - ret := Url{} - ret.HasCard = true - - ret.Domain = values.Domain.Value - ret.Title = values.Title.Value - ret.Description = values.Description.Value - ret.IsContentDownloaded = false - ret.CreatorID = UserID(values.Creator.UserValue.Value) - ret.SiteID = UserID(values.Site.UserValue.Value) - - var thumbnail_url string - - if apiCard.Name == "summary_large_image" || apiCard.Name == "summary" { - thumbnail_url = values.Thumbnail.ImageValue.Url - } else if apiCard.Name == "player" { - thumbnail_url = values.PlayerImage.ImageValue.Url - } else if apiCard.Name == "unified_card" { - // TODO: Grok chat previews - log.Print("Grok chat card, not implemented yet-- skipping") - } else { - panic("Unknown card type: " + apiCard.Name) - } - - if thumbnail_url != "" { - ret.HasThumbnail = true - ret.ThumbnailRemoteUrl = thumbnail_url - ret.ThumbnailLocalPath = get_thumbnail_local_path(thumbnail_url) - ret.ThumbnailWidth = values.Thumbnail.ImageValue.Width - ret.ThumbnailHeight = values.Thumbnail.ImageValue.Height - } - - return ret -} - -func get_prefixed_path(p string) string { - local_prefix_regex := regexp.MustCompile(`^[\w-]{2}`) - local_prefix := local_prefix_regex.FindString(p) - if len(local_prefix) != 2 { - panic(fmt.Sprintf("Unable to extract a 2-letter prefix for filename %s", p)) - } - return path.Join(local_prefix, p) -} - -func get_thumbnail_local_path(remote_url string) string { - u, err := url.Parse(remote_url) - if err != nil { - panic(err) - } - if u.RawQuery == "" { - return path.Base(u.Path) - } - query_params, err := url.ParseQuery(u.RawQuery) - if err != nil { - panic(err) - } - - return get_prefixed_path( - fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0]), - ) -} - -// Given an URL, try to parse it as a tweet url. -// The bool is an `is_ok` value; true if the parse was successful, false if it didn't match -func TryParseTweetUrl(s string) (UserHandle, TweetID, bool) { - parsed_url, err := url.Parse(s) - if err != nil { - return UserHandle(""), TweetID(0), false - } - - if parsed_url.Host != "twitter.com" && parsed_url.Host != "mobile.twitter.com" && parsed_url.Host != "x.com" { - return UserHandle(""), TweetID(0), false - } - - r := regexp.MustCompile(`^/(\w+)/status/(\d+)$`) - matches := r.FindStringSubmatch(parsed_url.Path) - if matches == nil { - return UserHandle(""), TweetID(0), false - } - if len(matches) != 3 { // matches[0] is the full string - panic(matches) - } - return UserHandle(matches[1]), TweetID(int_or_panic(matches[2])), true -} - -/** - * Given a tweet URL, return the corresponding user handle. - * If tweet url is not valid, return an error. - */ -func ParseHandleFromTweetUrl(tweet_url string) (UserHandle, error) { - short_url_regex := regexp.MustCompile(`^https://t.co/\w{5,20}$`) - if short_url_regex.MatchString(tweet_url) { - tweet_url = ExpandShortUrl(tweet_url) - } - - ret, _, is_ok := TryParseTweetUrl(tweet_url) - if !is_ok { - return "", fmt.Errorf("Invalid tweet url: %s", tweet_url) - } - return ret, nil -} diff --git a/pkg/scraper/user.go b/pkg/scraper/user.go index c1cdef0..af2edad 100644 --- a/pkg/scraper/user.go +++ b/pkg/scraper/user.go @@ -4,9 +4,6 @@ import ( "fmt" "path" "regexp" - "strings" - - "gitlab.com/offline-twitter/twitter_offline_engine/pkg/terminal_utils" ) const DEFAULT_PROFILE_IMAGE_URL = "https://abs.twimg.com/sticky/default_profile_images/default_profile.png" @@ -15,14 +12,6 @@ const DEFAULT_PROFILE_IMAGE = "default_profile.png" type UserID int64 type UserHandle string -func JoinArrayOfHandles(handles []UserHandle) string { - ret := []string{} - for _, h := range handles { - ret = append(ret, string(h)) - } - return strings.Join(ret, ",") -} - type User struct { ID UserID `db:"id"` DisplayName string `db:"display_name"` @@ -51,40 +40,6 @@ type User struct { IsIdFake bool `db:"is_id_fake"` } -func (u User) String() string { - var verified string - if u.IsVerified { - verified = "[\u2713]" - } - ret := fmt.Sprintf( - `%s%s -@%s - %s - -Following: %d Followers: %d - -Joined %s -%s -%s -`, - u.DisplayName, - verified, - u.Handle, - terminal_utils.WrapText(u.Bio, 60), - u.FollowingCount, - u.FollowersCount, - terminal_utils.FormatDate(u.JoinDate.Time), - u.Location, - u.Website, - ) - if u.PinnedTweet != nil { - ret += "\n" + terminal_utils.WrapText(u.PinnedTweet.Text, 60) - } else { - println("Pinned tweet id:", u.PinnedTweetID) - } - return ret -} - func GetUnknownUser() User { return User{ ID: UserID(0x4000000000000000), // 2^62 @@ -125,63 +80,6 @@ func GetUnknownUserWithHandle(handle UserHandle) User { } } -// Turn an APIUser, as returned from the scraper, into a properly structured User object -func ParseSingleUser(apiUser APIUser) (ret User, err error) { - if apiUser.DoesntExist { - // User may have been deleted, or there was a typo. There's no data to parse - if apiUser.ScreenName == "" { - panic("ScreenName is empty!") - } - ret = GetUnknownUserWithHandle(UserHandle(apiUser.ScreenName)) - return - } - ret.ID = UserID(apiUser.ID) - ret.Handle = UserHandle(apiUser.ScreenName) - if apiUser.IsBanned { - // Banned users won't have any further info, so just return here - ret.IsBanned = true - return - } - ret.DisplayName = apiUser.Name - ret.Bio = apiUser.Description - ret.FollowingCount = apiUser.FriendsCount - ret.FollowersCount = apiUser.FollowersCount - ret.Location = apiUser.Location - if len(apiUser.Entities.URL.Urls) > 0 { - ret.Website = apiUser.Entities.URL.Urls[0].ExpandedURL - } - ret.JoinDate, err = TimestampFromString(apiUser.CreatedAt) - if err != nil { - err = fmt.Errorf("Error parsing time on user ID %d: %w", ret.ID, err) - return - } - ret.IsPrivate = apiUser.Protected - ret.IsVerified = apiUser.Verified - ret.ProfileImageUrl = apiUser.ProfileImageURLHTTPS - - if regexp.MustCompile(`_normal\.\w{2,4}`).MatchString(ret.ProfileImageUrl) { - ret.ProfileImageUrl = strings.ReplaceAll(ret.ProfileImageUrl, "_normal.", ".") - } - ret.BannerImageUrl = apiUser.ProfileBannerURL - - ret.ProfileImageLocalPath = ret.compute_profile_image_local_path() - ret.BannerImageLocalPath = ret.compute_banner_image_local_path() - - if len(apiUser.PinnedTweetIdsStr) > 0 { - ret.PinnedTweetID = TweetID(idstr_to_int(apiUser.PinnedTweetIdsStr[0])) - } - return -} - -// Calls API#GetUserByID and returns the parsed result -func GetUserByID(u_id UserID) (User, error) { - session, err := NewGuestSession() // This endpoint works better if you're not logged in - if err != nil { - return User{}, err - } - return session.GetUserByID(u_id) -} - /** * Make a filename for the profile image, that hopefully won't clobber other ones */ diff --git a/pkg/scraper/video.go b/pkg/scraper/video.go index 25f886a..cbc9e68 100644 --- a/pkg/scraper/video.go +++ b/pkg/scraper/video.go @@ -1,16 +1,7 @@ package scraper -import ( - "net/url" - "path" - "sort" -) - type VideoID int64 -// TODO video-source-user: extract source user information (e.g., someone shares a video -// from someone else). - type Video struct { ID VideoID `db:"id"` TweetID TweetID `db:"tweet_id"` @@ -30,56 +21,3 @@ type Video struct { IsGeoblocked bool `db:"is_geoblocked"` IsGif bool `db:"is_gif"` } - -func get_filename(remote_url string) string { - u, err := url.Parse(remote_url) - if err != nil { - panic(err) - } - return path.Base(u.Path) -} - -func ParseAPIVideo(apiVideo APIExtendedMedia) Video { - variants := apiVideo.VideoInfo.Variants - sort.Sort(variants) - video_remote_url := variants[0].URL - - var view_count int - - r := apiVideo.Ext.MediaStats.R - - switch r.(type) { - case string: - view_count = 0 - case map[string]interface{}: - OK_entry, ok := r.(map[string]interface{})["ok"] - if !ok { - panic("No 'ok' value found in the R!") - } - view_count_str, ok := OK_entry.(map[string]interface{})["viewCount"] - view_count = int_or_panic(view_count_str.(string)) - if !ok { - panic("No 'viewCount' value found in the OK!") - } - } - - local_filename := get_prefixed_path(get_filename(video_remote_url)) - - return Video{ - ID: VideoID(apiVideo.ID), - Width: apiVideo.OriginalInfo.Width, - Height: apiVideo.OriginalInfo.Height, - RemoteURL: video_remote_url, - LocalFilename: local_filename, - - ThumbnailRemoteUrl: apiVideo.MediaURLHttps, - ThumbnailLocalPath: get_prefixed_path(path.Base(apiVideo.MediaURLHttps)), - Duration: apiVideo.VideoInfo.Duration, - ViewCount: view_count, - - IsDownloaded: false, - IsBlockedByDMCA: false, - IsGeoblocked: apiVideo.ExtMediaAvailability.Reason == "Geoblocked", - IsGif: apiVideo.Type == "animated_gif", - } -}