REFACTOR: move as much API code out of the types files and into api_types_... files as possible

- also remove a few useless functions
Alessio 2024-12-21 09:04:00 -08:00
parent 1f44fb0961
commit 850662c3cb
23 changed files with 720 additions and 856 deletions

View File

@ -4,6 +4,10 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"html" "html"
"log"
"net/url"
"path"
"regexp"
"sort" "sort"
"strconv" "strconv"
"strings" "strings"
@ -21,6 +25,19 @@ type APIMedia struct {
} `json:"original_info"` } `json:"original_info"`
} }
func ParseAPIMedia(apiMedia APIMedia) Image {
local_filename := get_prefixed_path(path.Base(apiMedia.MediaURLHttps))
return Image{
ID: ImageID(apiMedia.ID),
RemoteURL: apiMedia.MediaURLHttps,
Width: apiMedia.OriginalInfo.Width,
Height: apiMedia.OriginalInfo.Height,
LocalFilename: local_filename,
IsDownloaded: false,
}
}
type SortableVariants []struct { type SortableVariants []struct {
Bitrate int `json:"bitrate,omitempty"` Bitrate int `json:"bitrate,omitempty"`
URL string `json:"url"` URL string `json:"url"`
@ -137,6 +154,164 @@ type APICard struct {
} `json:"binding_values"` } `json:"binding_values"`
} }
func ParseAPIPoll(apiCard APICard) Poll {
card_url, err := url.Parse(apiCard.ShortenedUrl)
if err != nil {
panic(err)
}
id := int_or_panic(card_url.Hostname())
ret := Poll{}
ret.ID = PollID(id)
ret.NumChoices = parse_num_choices(apiCard.Name)
ret.VotingDuration = int_or_panic(apiCard.BindingValues.DurationMinutes.StringValue) * 60
ret.VotingEndsAt, err = TimestampFromString(apiCard.BindingValues.EndDatetimeUTC.StringValue)
if err != nil {
panic(err)
}
ret.LastUpdatedAt, err = TimestampFromString(apiCard.BindingValues.LastUpdatedAt.StringValue)
if err != nil {
panic(err)
}
ret.Choice1 = apiCard.BindingValues.Choice1.StringValue
ret.Choice1_Votes = int_or_panic(apiCard.BindingValues.Choice1_Count.StringValue)
ret.Choice2 = apiCard.BindingValues.Choice2.StringValue
ret.Choice2_Votes = int_or_panic(apiCard.BindingValues.Choice2_Count.StringValue)
if ret.NumChoices > 2 {
ret.Choice3 = apiCard.BindingValues.Choice3.StringValue
ret.Choice3_Votes = int_or_panic(apiCard.BindingValues.Choice3_Count.StringValue)
}
if ret.NumChoices > 3 {
ret.Choice4 = apiCard.BindingValues.Choice4.StringValue
ret.Choice4_Votes = int_or_panic(apiCard.BindingValues.Choice4_Count.StringValue)
}
return ret
}
func parse_num_choices(card_name string) int {
if strings.Index(card_name, "poll") != 0 || strings.Index(card_name, "choice") != 5 {
panic("Not valid card name: " + card_name)
}
return int_or_panic(card_name[4:5])
}
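For reference, a minimal sketch (not part of this commit) of the mapping parse_num_choices implements; the card names below are assumed examples of the poll<N>choice naming scheme the check above expects:

package scraper

import "fmt"

// Sketch only: assumed card names like "poll2choice_text_only" follow the
// "poll<N>choice..." pattern checked above, so the digit at index 4 is the choice count.
func example_parse_num_choices() {
    fmt.Println(parse_num_choices("poll2choice_text_only")) // 2
    fmt.Println(parse_num_choices("poll4choice_text_only")) // 4
    // A non-poll card name (e.g. "summary_large_image") would panic here, which is why
    // ParseSingleTweet only calls ParseAPIPoll when the card name starts with "poll".
}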
func ParseAPIVideo(apiVideo APIExtendedMedia) Video {
variants := apiVideo.VideoInfo.Variants
sort.Sort(variants)
video_remote_url := variants[0].URL
var view_count int
r := apiVideo.Ext.MediaStats.R
switch r.(type) {
case string:
view_count = 0
case map[string]interface{}:
OK_entry, ok := r.(map[string]interface{})["ok"]
if !ok {
panic("No 'ok' value found in the R!")
}
view_count_str, ok := OK_entry.(map[string]interface{})["viewCount"]
if !ok {
panic("No 'viewCount' value found in the OK!")
}
view_count = int_or_panic(view_count_str.(string))
}
video_parsed_url, err := url.Parse(video_remote_url)
if err != nil {
panic(err)
}
local_filename := get_prefixed_path(path.Base(video_parsed_url.Path))
return Video{
ID: VideoID(apiVideo.ID),
Width: apiVideo.OriginalInfo.Width,
Height: apiVideo.OriginalInfo.Height,
RemoteURL: video_remote_url,
LocalFilename: local_filename,
ThumbnailRemoteUrl: apiVideo.MediaURLHttps,
ThumbnailLocalPath: get_prefixed_path(path.Base(apiVideo.MediaURLHttps)),
Duration: apiVideo.VideoInfo.Duration,
ViewCount: view_count,
IsDownloaded: false,
IsBlockedByDMCA: false,
IsGeoblocked: apiVideo.ExtMediaAvailability.Reason == "Geoblocked",
IsGif: apiVideo.Type == "animated_gif",
}
}
func ParseAPIUrlCard(apiCard APICard) Url {
values := apiCard.BindingValues
ret := Url{}
ret.HasCard = true
ret.Domain = values.Domain.Value
ret.Title = values.Title.Value
ret.Description = values.Description.Value
ret.IsContentDownloaded = false
ret.CreatorID = UserID(values.Creator.UserValue.Value)
ret.SiteID = UserID(values.Site.UserValue.Value)
var thumbnail_url string
if apiCard.Name == "summary_large_image" || apiCard.Name == "summary" {
thumbnail_url = values.Thumbnail.ImageValue.Url
} else if apiCard.Name == "player" {
thumbnail_url = values.PlayerImage.ImageValue.Url
} else if apiCard.Name == "unified_card" {
// TODO: Grok chat previews
log.Print("Grok chat card, not implemented yet-- skipping")
} else {
panic("Unknown card type: " + apiCard.Name)
}
if thumbnail_url != "" {
ret.HasThumbnail = true
ret.ThumbnailRemoteUrl = thumbnail_url
ret.ThumbnailLocalPath = get_thumbnail_local_path(thumbnail_url)
ret.ThumbnailWidth = values.Thumbnail.ImageValue.Width
ret.ThumbnailHeight = values.Thumbnail.ImageValue.Height
}
return ret
}
func get_prefixed_path(p string) string {
local_prefix_regex := regexp.MustCompile(`^[\w-]{2}`)
local_prefix := local_prefix_regex.FindString(p)
if len(local_prefix) != 2 {
panic(fmt.Sprintf("Unable to extract a 2-letter prefix for filename %s", p))
}
return path.Join(local_prefix, p)
}
func get_thumbnail_local_path(remote_url string) string {
u, err := url.Parse(remote_url)
if err != nil {
panic(err)
}
if u.RawQuery == "" {
return path.Base(u.Path)
}
query_params, err := url.ParseQuery(u.RawQuery)
if err != nil {
panic(err)
}
return get_prefixed_path(
fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0]),
)
}
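To make the two path helpers above easier to follow, a hedged sketch (not from this commit) of what they produce; the filename and thumbnail URL are invented:

package scraper

import "fmt"

// Sketch only: the filename and thumbnail URL are invented for illustration.
func example_prefixed_paths() {
    // get_prefixed_path files everything under a subdirectory named after its first two characters.
    fmt.Println(get_prefixed_path("Eg4rzCfUcAAVNFF.jpg")) // "Eg/Eg4rzCfUcAAVNFF.jpg"

    // get_thumbnail_local_path folds the "name" and "format" query params into the filename
    // before applying the same prefixing.
    fmt.Println(get_thumbnail_local_path("https://pbs.twimg.com/card_img/12345/AbCdE?format=jpg&name=800x419"))
    // "Ab/AbCdE_800x419.jpg"
}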
type APITweet struct { type APITweet struct {
ID int64 `json:"id_str,string"` ID int64 `json:"id_str,string"`
ConversationID int64 `json:"conversation_id_str,string"` ConversationID int64 `json:"conversation_id_str,string"`
@ -184,6 +359,171 @@ type APITweet struct {
IsExpandable bool IsExpandable bool
} }
func (t APITweet) ToTweetTrove() (TweetTrove, error) {
ret := NewTweetTrove()
if t.RetweetedStatusIDStr == "" {
// Parse as a Tweet
new_tweet, err := ParseSingleTweet(t)
if err != nil {
return ret, err
}
ret.Tweets[new_tweet.ID] = new_tweet
for _, space := range new_tweet.Spaces {
ret.Spaces[space.ID] = space
}
} else {
// Parse as a Retweet
new_retweet := Retweet{}
var err error
t.NormalizeContent()
new_retweet.RetweetID = TweetID(t.ID)
new_retweet.TweetID = TweetID(t.RetweetedStatusID)
new_retweet.RetweetedByID = UserID(t.UserID)
new_retweet.RetweetedAt, err = TimestampFromString(t.CreatedAt)
if err != nil {
return ret, err
}
ret.Retweets[new_retweet.RetweetID] = new_retweet
}
return ret, nil
}
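A hedged sketch of the calling pattern this method enables; the helper below is illustrative only and mirrors the updated TweetResponse.ToTweetTrove further down:

package scraper

// Sketch only: the tweet-vs-retweet branch now lives inside ToTweetTrove, so a caller
// handling a batch just merges each result.
func example_collect_tweets(api_tweets []APITweet) (TweetTrove, error) {
    ret := NewTweetTrove()
    for _, api_tweet := range api_tweets {
        trove, err := api_tweet.ToTweetTrove()
        if err != nil {
            return ret, err
        }
        ret.MergeWith(trove) // plain tweets land in ret.Tweets, retweets in ret.Retweets
    }
    return ret, nil
}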
// Turn an APITweet, as returned from the scraper, into a properly structured Tweet object
func ParseSingleTweet(t APITweet) (ret Tweet, err error) {
t.NormalizeContent()
ret.ID = TweetID(t.ID)
ret.UserID = UserID(t.UserID)
ret.UserHandle = UserHandle(t.UserHandle)
ret.Text = t.FullText
ret.IsExpandable = t.IsExpandable
// Process "posted-at" date and time
if t.TombstoneText == "" { // Skip time parsing for tombstones
ret.PostedAt, err = TimestampFromString(t.CreatedAt)
if err != nil {
if ret.ID == 0 {
return Tweet{}, fmt.Errorf("unable to parse tweet: %w", ERR_NO_TWEET)
}
return Tweet{}, fmt.Errorf("Error parsing time on tweet ID %d:\n %w", ret.ID, err)
}
}
ret.NumLikes = t.FavoriteCount
ret.NumRetweets = t.RetweetCount
ret.NumReplies = t.ReplyCount
ret.NumQuoteTweets = t.QuoteCount
ret.InReplyToID = TweetID(t.InReplyToStatusID)
ret.QuotedTweetID = TweetID(t.QuotedStatusID)
// Process URLs and link previews
for _, url := range t.Entities.URLs {
var url_object Url
if t.Card.ShortenedUrl == url.ShortenedUrl {
if t.Card.Name == "3691233323:audiospace" {
// This "url" is just a link to a Space. Don't process it as a Url
continue
}
url_object = ParseAPIUrlCard(t.Card)
}
url_object.Text = url.ExpandedURL
url_object.ShortText = url.ShortenedUrl
url_object.TweetID = ret.ID
// Skip it if it's just the quoted tweet
_, id, is_ok := TryParseTweetUrl(url.ExpandedURL)
if is_ok && id == ret.QuotedTweetID {
continue
}
ret.Urls = append(ret.Urls, url_object)
}
// Process images
for _, media := range t.Entities.Media {
if media.Type != "photo" {
// Videos now have an entry in "Entities.Media" but they can be ignored; the useful bit is in ExtendedEntities
// So skip ones that aren't "photo"
continue
}
new_image := ParseAPIMedia(media)
new_image.TweetID = ret.ID
ret.Images = append(ret.Images, new_image)
}
// Process hashtags
for _, hashtag := range t.Entities.Hashtags {
ret.Hashtags = append(ret.Hashtags, hashtag.Text)
}
// Process `@` mentions and reply-mentions
for _, mention := range t.Entities.Mentions {
ret.Mentions = append(ret.Mentions, mention.UserName)
}
for _, mention := range strings.Split(t.Entities.ReplyMentions, " ") {
if mention != "" {
if mention[0] != '@' {
panic(fmt.Errorf("Unknown ReplyMention value %q:\n %w", t.Entities.ReplyMentions, EXTERNAL_API_ERROR))
}
ret.ReplyMentions = append(ret.ReplyMentions, mention[1:])
}
}
// Process videos
for _, entity := range t.ExtendedEntities.Media {
if entity.Type != "video" && entity.Type != "animated_gif" {
continue
}
new_video := ParseAPIVideo(entity)
new_video.TweetID = ret.ID
ret.Videos = append(ret.Videos, new_video)
// Remove the thumbnail from the Images list
updated_imgs := []Image{}
for _, img := range ret.Images {
if VideoID(img.ID) != new_video.ID {
updated_imgs = append(updated_imgs, img)
}
}
ret.Images = updated_imgs
}
// Process polls
if strings.Index(t.Card.Name, "poll") == 0 {
poll := ParseAPIPoll(t.Card)
poll.TweetID = ret.ID
ret.Polls = []Poll{poll}
}
// Process spaces
if t.Card.Name == "3691233323:audiospace" {
space := Space{}
space.ID = SpaceID(t.Card.BindingValues.ID.StringValue)
space.ShortUrl = t.Card.ShortenedUrl
// Indicate that this Space needs its details fetched still
space.IsDetailsFetched = false
ret.Spaces = []Space{space}
ret.SpaceID = space.ID
}
// Process tombstones and other metadata
ret.TombstoneType = t.TombstoneText
ret.IsStub = !(ret.TombstoneType == "")
ret.LastScrapedAt = TimestampFromUnix(0) // Caller will change this for the tweet that was actually scraped
ret.IsConversationScraped = false // Safe due to the "No Worsening" principle
// Extra data that can help piece together tombstoned tweet info
ret.in_reply_to_user_id = UserID(t.InReplyToUserID)
ret.in_reply_to_user_handle = UserHandle(t.InReplyToScreenName)
return
}
func (t *APITweet) NormalizeContent() { func (t *APITweet) NormalizeContent() {
id, err := strconv.Atoi(t.QuotedStatusIDStr) id, err := strconv.Atoi(t.QuotedStatusIDStr)
if err == nil { if err == nil {
@ -260,6 +600,54 @@ type APIUser struct {
DoesntExist bool DoesntExist bool
} }
// Turn an APIUser, as returned from the scraper, into a properly structured User object
func ParseSingleUser(apiUser APIUser) (ret User, err error) {
if apiUser.DoesntExist {
// User may have been deleted, or there was a typo. There's no data to parse
if apiUser.ScreenName == "" {
panic("ScreenName is empty!")
}
ret = GetUnknownUserWithHandle(UserHandle(apiUser.ScreenName))
return
}
ret.ID = UserID(apiUser.ID)
ret.Handle = UserHandle(apiUser.ScreenName)
if apiUser.IsBanned {
// Banned users won't have any further info, so just return here
ret.IsBanned = true
return
}
ret.DisplayName = apiUser.Name
ret.Bio = apiUser.Description
ret.FollowingCount = apiUser.FriendsCount
ret.FollowersCount = apiUser.FollowersCount
ret.Location = apiUser.Location
if len(apiUser.Entities.URL.Urls) > 0 {
ret.Website = apiUser.Entities.URL.Urls[0].ExpandedURL
}
ret.JoinDate, err = TimestampFromString(apiUser.CreatedAt)
if err != nil {
err = fmt.Errorf("Error parsing time on user ID %d: %w", ret.ID, err)
return
}
ret.IsPrivate = apiUser.Protected
ret.IsVerified = apiUser.Verified
ret.ProfileImageUrl = apiUser.ProfileImageURLHTTPS
if regexp.MustCompile(`_normal\.\w{2,4}`).MatchString(ret.ProfileImageUrl) {
ret.ProfileImageUrl = strings.ReplaceAll(ret.ProfileImageUrl, "_normal.", ".")
}
ret.BannerImageUrl = apiUser.ProfileBannerURL
ret.ProfileImageLocalPath = ret.compute_profile_image_local_path()
ret.BannerImageLocalPath = ret.compute_banner_image_local_path()
if len(apiUser.PinnedTweetIdsStr) > 0 {
ret.PinnedTweetID = TweetID(idstr_to_int(apiUser.PinnedTweetIdsStr[0]))
}
return
}
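A minimal usage sketch for ParseSingleUser, assuming an APIUser decoded from a saved API response; the file path is invented for illustration:

package scraper

import (
    "encoding/json"
    "os"
)

// Sketch only: the test_responses path is assumed; the existing tests follow the same shape.
func example_parse_single_user() (User, error) {
    data, err := os.ReadFile("test_responses/example_user_profile.json") // assumed path
    if err != nil {
        return User{}, err
    }
    var api_user APIUser
    if err := json.Unmarshal(data, &api_user); err != nil {
        return User{}, err
    }
    return ParseSingleUser(api_user)
}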
type APINotification struct { type APINotification struct {
ID string `json:"id"` ID string `json:"id"`
TimestampMs int64 `json:"timestampMs,string"` TimestampMs int64 `json:"timestampMs,string"`
@ -565,22 +953,11 @@ func (t *TweetResponse) ToTweetTrove() (TweetTrove, error) {
ret := NewTweetTrove() ret := NewTweetTrove()
for _, single_tweet := range t.GlobalObjects.Tweets { for _, single_tweet := range t.GlobalObjects.Tweets {
if single_tweet.RetweetedStatusIDStr == "" { trove, err := single_tweet.ToTweetTrove()
new_tweet, err := ParseSingleTweet(single_tweet) if err != nil {
if err != nil { return ret, err
return ret, err
}
ret.Tweets[new_tweet.ID] = new_tweet
for _, space := range new_tweet.Spaces {
ret.Spaces[space.ID] = space
}
} else {
new_retweet, err := ParseSingleRetweet(single_tweet)
if err != nil {
return ret, err
}
ret.Retweets[new_retweet.RetweetID] = new_retweet
} }
ret.MergeWith(trove)
} }
for _, user := range t.GlobalObjects.Users { for _, user := range t.GlobalObjects.Users {
@ -597,10 +974,14 @@ func (t *TweetResponse) ToTweetTrove() (TweetTrove, error) {
return ret, nil return ret, nil
} }
func idstr_to_int(idstr string) int64 { func idstr_to_int(s string) int64 {
id, err := strconv.Atoi(idstr) return int64(int_or_panic(s))
}
func int_or_panic(s string) int {
result, err := strconv.Atoi(s)
if err != nil { if err != nil {
panic(err) panic(err)
} }
return int64(id) return result
} }

View File

@ -5,6 +5,7 @@ import (
"fmt" "fmt"
"html" "html"
"net/url" "net/url"
"path"
"strings" "strings"
"github.com/google/uuid" "github.com/google/uuid"
@ -77,38 +78,106 @@ func (m *APIDMMessage) NormalizeContent() {
m.MessageData.Text = strings.TrimSpace(m.MessageData.Text) m.MessageData.Text = strings.TrimSpace(m.MessageData.Text)
} }
func (m APIDMMessage) ToTweetTrove() TweetTrove { func (api_msg APIDMMessage) ToTweetTrove() TweetTrove {
ret := NewTweetTrove() ret := NewTweetTrove()
if m.ID == 0 { if api_msg.ID == 0 {
return ret return ret
} }
m.NormalizeContent() api_msg.NormalizeContent()
result := ParseAPIDMMessage(m)
msg := DMMessage{}
msg.ID = DMMessageID(api_msg.ID)
msg.SentAt = TimestampFromUnixMilli(int64(api_msg.Time))
msg.DMChatRoomID = DMChatRoomID(api_msg.ConversationID)
msg.SenderID = UserID(api_msg.MessageData.SenderID)
msg.Text = api_msg.MessageData.Text
msg.InReplyToID = DMMessageID(api_msg.MessageData.ReplyData.ID) // Will be "0" if not a reply
msg.Reactions = make(map[UserID]DMReaction)
for _, api_reacc := range api_msg.MessageReactions {
reacc := DMReaction{}
reacc.ID = DMMessageID(api_reacc.ID)
reacc.SenderID = UserID(api_reacc.SenderID)
reacc.SentAt = TimestampFromUnixMilli(int64(api_reacc.Time))
reacc.Emoji = api_reacc.Emoji
reacc.DMMessageID = msg.ID
msg.Reactions[reacc.SenderID] = reacc
}
if api_msg.MessageData.Attachment.Photo.ID != 0 {
new_image := ParseAPIMedia(api_msg.MessageData.Attachment.Photo)
new_image.DMMessageID = msg.ID
msg.Images = []Image{new_image}
}
if api_msg.MessageData.Attachment.Video.ID != 0 {
entity := api_msg.MessageData.Attachment.Video
if entity.Type == "video" || entity.Type == "animated_gif" {
new_video := ParseAPIVideo(entity)
new_video.DMMessageID = msg.ID
msg.Videos = append(msg.Videos, new_video)
}
}
// Process URLs and link previews
for _, url := range api_msg.MessageData.Entities.URLs {
// Skip it if it's an embedded tweet
_, id, is_ok := TryParseTweetUrl(url.ExpandedURL)
if is_ok && id == TweetID(api_msg.MessageData.Attachment.Tweet.Status.ID) {
continue
}
// Skip it if it's an embedded image
if api_msg.MessageData.Attachment.Photo.URL == url.ShortenedUrl {
continue
}
// Skip it if it's an embedded video
if api_msg.MessageData.Attachment.Video.URL == url.ShortenedUrl {
continue
}
var new_url Url
if api_msg.MessageData.Attachment.Card.ShortenedUrl == url.ShortenedUrl {
if api_msg.MessageData.Attachment.Card.Name == "3691233323:audiospace" {
// This "url" is just a link to a Space. Don't process it as a Url
// TODO: ...but do process it as a Space?
continue
}
new_url = ParseAPIUrlCard(api_msg.MessageData.Attachment.Card)
}
new_url.Text = url.ExpandedURL
new_url.ShortText = url.ShortenedUrl
new_url.DMMessageID = msg.ID
msg.Urls = append(msg.Urls, new_url)
}
// Parse tweet attachment // Parse tweet attachment
if m.MessageData.Attachment.Tweet.Status.ID != 0 { if api_msg.MessageData.Attachment.Tweet.Status.ID != 0 {
u, err := ParseSingleUser(m.MessageData.Attachment.Tweet.Status.User) u, err := ParseSingleUser(api_msg.MessageData.Attachment.Tweet.Status.User)
if err != nil { if err != nil {
panic(err) panic(err)
} }
ret.Users[u.ID] = u ret.Users[u.ID] = u
t, err := ParseSingleTweet(m.MessageData.Attachment.Tweet.Status.APITweet) t, err := ParseSingleTweet(api_msg.MessageData.Attachment.Tweet.Status.APITweet)
if err != nil { if err != nil {
panic(err) panic(err)
} }
t.UserID = u.ID t.UserID = u.ID
ret.Tweets[t.ID] = t ret.Tweets[t.ID] = t
result.EmbeddedTweetID = t.ID msg.EmbeddedTweetID = t.ID
} }
ret.Messages[result.ID] = result ret.Messages[msg.ID] = msg
// TODO: parse attached images and videos
return ret return ret
} }
type APIDMResponse struct {
InboxInitialState APIInbox `json:"inbox_initial_state"`
InboxTimeline APIInbox `json:"inbox_timeline"`
ConversationTimeline APIInbox `json:"conversation_timeline"`
UserEvents APIInbox `json:"user_events"`
}
type APIDMConversation struct { type APIDMConversation struct {
ConversationID string `json:"conversation_id"` ConversationID string `json:"conversation_id"`
Type string `json:"type"` Type string `json:"type"`
@ -179,13 +248,6 @@ type APIInbox struct {
Conversations map[string]APIDMConversation `json:"conversations"` Conversations map[string]APIDMConversation `json:"conversations"`
} }
type APIDMResponse struct {
InboxInitialState APIInbox `json:"inbox_initial_state"`
InboxTimeline APIInbox `json:"inbox_timeline"`
ConversationTimeline APIInbox `json:"conversation_timeline"`
UserEvents APIInbox `json:"user_events"`
}
func (r APIInbox) ToTweetTrove(current_user_id UserID) TweetTrove { func (r APIInbox) ToTweetTrove(current_user_id UserID) TweetTrove {
ret := NewTweetTrove() ret := NewTweetTrove()
@ -211,8 +273,8 @@ func (r APIInbox) ToTweetTrove(current_user_id UserID) TweetTrove {
ret.MergeWith(entry.Message.ToTweetTrove()) ret.MergeWith(entry.Message.ToTweetTrove())
} }
for _, room := range r.Conversations { for _, api_room := range r.Conversations {
result := ParseAPIDMChatRoom(room, current_user_id) result := ParseAPIDMChatRoom(api_room, current_user_id)
ret.Rooms[result.ID] = result ret.Rooms[result.ID] = result
} }
for _, u := range r.Users { for _, u := range r.Users {
@ -225,6 +287,46 @@ func (r APIInbox) ToTweetTrove(current_user_id UserID) TweetTrove {
return ret return ret
} }
func ParseAPIDMChatRoom(api_room APIDMConversation, current_user_id UserID) DMChatRoom {
result := DMChatRoom{}
result.ID = DMChatRoomID(api_room.ConversationID)
result.Type = api_room.Type
result.LastMessagedAt = TimestampFromUnixMilli(int64(api_room.SortTimestamp))
result.IsNSFW = api_room.NSFW
if result.Type == "GROUP_DM" {
result.CreatedAt = TimestampFromUnixMilli(int64(api_room.CreateTime))
result.CreatedByUserID = UserID(api_room.CreatedByUserID)
result.Name = api_room.Name
result.AvatarImageRemoteURL = api_room.AvatarImage
tmp_url, err := url.Parse(result.AvatarImageRemoteURL)
if err != nil {
panic(err)
}
result.AvatarImageLocalPath = fmt.Sprintf("%s_avatar_%s.%s", result.ID, path.Base(tmp_url.Path), tmp_url.Query().Get("format"))
}
result.Participants = make(map[UserID]DMChatParticipant)
for _, api_participant := range api_room.Participants {
participant := DMChatParticipant{}
participant.UserID = UserID(api_participant.UserID)
participant.DMChatRoomID = result.ID
participant.LastReadEventID = DMMessageID(api_participant.LastReadEventID)
// Process chat settings if this is the logged-in user
if participant.UserID == current_user_id {
participant.IsNotificationsDisabled = api_room.NotificationsDisabled
participant.IsReadOnly = api_room.ReadOnly
participant.IsTrusted = api_room.Trusted
participant.IsMuted = api_room.Muted
participant.Status = api_room.Status
participant.IsChatSettingsValid = true
}
result.Participants[participant.UserID] = participant
}
return result
}
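An illustrative sketch (invented conversation ID and avatar URL) of how the GROUP_DM branch above derives the avatar's local path:

package scraper

import "fmt"

// Sketch only, with invented values: shows how the GROUP_DM branch builds the avatar's
// local path from the room ID, the URL path's base name, and the "format" query param.
func example_group_dm_avatar_path() {
    room := ParseAPIDMChatRoom(APIDMConversation{
        ConversationID: "1458284524761075714-1488963321701171204",                 // invented
        Type:           "GROUP_DM",
        AvatarImage:    "https://pbs.twimg.com/dm_group_img/1234/abcd?format=jpg", // invented
    }, UserID(0))
    fmt.Println(room.AvatarImageLocalPath) // "1458284524761075714-1488963321701171204_avatar_abcd.jpg"
}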
func (api *API) GetDMInbox() (APIInbox, error) { func (api *API) GetDMInbox() (APIInbox, error) {
url, err := url.Parse("https://twitter.com/i/api/1.1/dm/inbox_initial_state.json") url, err := url.Parse("https://twitter.com/i/api/1.1/dm/inbox_initial_state.json")
if err != nil { if err != nil {
@ -284,6 +386,30 @@ func (api *API) GetDMInbox() (APIInbox, error) {
result.InboxInitialState.Status = result.InboxInitialState.InboxTimelines.Trusted.Status result.InboxInitialState.Status = result.InboxInitialState.InboxTimelines.Trusted.Status
return result.InboxInitialState, err return result.InboxInitialState, err
} }
func (api *API) GetInbox(how_many int) (TweetTrove, string, error) {
if !api.IsAuthenticated {
return TweetTrove{}, "", ErrLoginRequired
}
dm_response, err := api.GetDMInbox()
if err != nil {
panic(err)
}
trove := dm_response.ToTweetTrove(api.UserID)
cursor := dm_response.Cursor
next_cursor_id := dm_response.InboxTimelines.Trusted.MinEntryID
for len(trove.Rooms) < how_many && dm_response.Status != "AT_END" {
dm_response, err = api.GetInboxTrusted(next_cursor_id)
if err != nil {
panic(err)
}
next_trove := dm_response.ToTweetTrove(api.UserID)
next_cursor_id = dm_response.MinEntryID
trove.MergeWith(next_trove)
}
return trove, cursor, nil
}
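A hedged usage sketch for the inbox pagination loop above; the target count is arbitrary:

package scraper

// Sketch only: 50 is an arbitrary target, not a value from this commit.
func example_get_inbox(api *API) error {
    trove, cursor, err := api.GetInbox(50)
    if err != nil {
        return err
    }
    _ = trove  // rooms, messages and users for the caller to store
    _ = cursor // would later be passed to api.PollInboxUpdates(cursor)
    return nil
}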
func (api *API) GetInboxTrusted(oldest_id int) (APIInbox, error) { func (api *API) GetInboxTrusted(oldest_id int) (APIInbox, error) {
url, err := url.Parse("https://twitter.com/i/api/1.1/dm/inbox_timeline/trusted.json") url, err := url.Parse("https://twitter.com/i/api/1.1/dm/inbox_timeline/trusted.json")
@ -345,62 +471,87 @@ func (api *API) GetInboxTrusted(oldest_id int) (APIInbox, error) {
return result.InboxTimeline, err return result.InboxTimeline, err
} }
func (api *API) GetDMConversation(id DMChatRoomID, max_id DMMessageID) (APIInbox, error) { func (api *API) GetConversation(room_id DMChatRoomID, max_id DMMessageID, how_many int) (TweetTrove, error) {
url, err := url.Parse("https://twitter.com/i/api/1.1/dm/conversation/" + string(id) + ".json") if !api.IsAuthenticated {
return TweetTrove{}, ErrLoginRequired
}
fetch := func(max_id DMMessageID) (APIInbox, error) {
url, err := url.Parse("https://twitter.com/i/api/1.1/dm/conversation/" + string(room_id) + ".json")
if err != nil {
panic(err)
}
query := url.Query()
query.Add("max_id", fmt.Sprint(max_id))
query.Add("context", "FETCH_DM_CONVERSATION_HISTORY")
query.Add("include_profile_interstitial_type", "1")
query.Add("include_blocking", "1")
query.Add("include_blocked_by", "1")
query.Add("include_followed_by", "1")
query.Add("include_want_retweets", "1")
query.Add("include_mute_edge", "1")
query.Add("include_can_dm", "1")
query.Add("include_can_media_tag", "1")
query.Add("include_ext_has_nft_avatar", "1")
query.Add("include_ext_is_blue_verified", "1")
query.Add("include_ext_verified_type", "1")
query.Add("include_ext_profile_image_shape", "1")
query.Add("skip_status", "1")
query.Add("dm_secret_conversations_enabled", "false")
query.Add("krs_registration_enabled", "true")
query.Add("cards_platform", "Web-12")
query.Add("include_cards", "1")
query.Add("include_ext_alt_text", "true")
query.Add("include_ext_limited_action_results", "true")
query.Add("include_quote_count", "true")
query.Add("include_reply_count", "1")
query.Add("tweet_mode", "extended")
query.Add("include_ext_views", "true")
query.Add("dm_users", "false")
query.Add("include_groups", "true")
query.Add("include_inbox_timelines", "true")
query.Add("include_ext_media_color", "true")
query.Add("supports_reactions", "true")
query.Add("include_conversation_info", "true")
query.Add("ext", strings.Join([]string{
"mediaColor",
"altText",
"mediaStats",
"highlightedLabel",
"hasNftAvatar",
"voiceInfo",
"birdwatchPivot",
"enrichments",
"superFollowMetadata",
"unmentionInfo",
"editControl",
"vibe",
}, ","))
url.RawQuery = query.Encode()
var result APIDMResponse
err = api.do_http(url.String(), "", &result)
return result.ConversationTimeline, err
}
dm_response, err := fetch(max_id)
if err != nil { if err != nil {
panic(err) panic(err)
} }
query := url.Query()
query.Add("max_id", fmt.Sprint(max_id))
query.Add("context", "FETCH_DM_CONVERSATION_HISTORY")
query.Add("include_profile_interstitial_type", "1")
query.Add("include_blocking", "1")
query.Add("include_blocked_by", "1")
query.Add("include_followed_by", "1")
query.Add("include_want_retweets", "1")
query.Add("include_mute_edge", "1")
query.Add("include_can_dm", "1")
query.Add("include_can_media_tag", "1")
query.Add("include_ext_has_nft_avatar", "1")
query.Add("include_ext_is_blue_verified", "1")
query.Add("include_ext_verified_type", "1")
query.Add("include_ext_profile_image_shape", "1")
query.Add("skip_status", "1")
query.Add("dm_secret_conversations_enabled", "false")
query.Add("krs_registration_enabled", "true")
query.Add("cards_platform", "Web-12")
query.Add("include_cards", "1")
query.Add("include_ext_alt_text", "true")
query.Add("include_ext_limited_action_results", "true")
query.Add("include_quote_count", "true")
query.Add("include_reply_count", "1")
query.Add("tweet_mode", "extended")
query.Add("include_ext_views", "true")
query.Add("dm_users", "false")
query.Add("include_groups", "true")
query.Add("include_inbox_timelines", "true")
query.Add("include_ext_media_color", "true")
query.Add("supports_reactions", "true")
query.Add("include_conversation_info", "true")
query.Add("ext", strings.Join([]string{
"mediaColor",
"altText",
"mediaStats",
"highlightedLabel",
"hasNftAvatar",
"voiceInfo",
"birdwatchPivot",
"enrichments",
"superFollowMetadata",
"unmentionInfo",
"editControl",
"vibe",
}, ","))
url.RawQuery = query.Encode()
var result APIDMResponse trove := dm_response.ToTweetTrove(api.UserID)
err = api.do_http(url.String(), "", &result) oldest := trove.GetOldestMessage(room_id)
return result.ConversationTimeline, err for len(trove.Messages) < how_many && dm_response.Status != "AT_END" {
dm_response, err = fetch(oldest)
if err != nil {
panic(err)
}
next_trove := dm_response.ToTweetTrove(api.UserID)
oldest = next_trove.GetOldestMessage(room_id)
trove.MergeWith(next_trove)
}
return trove, nil
} }
// Returns a TweetTrove and the cursor for the next update, or an error // Returns a TweetTrove and the cursor for the next update, or an error
@ -459,6 +610,9 @@ func (api *API) PollInboxUpdates(cursor string) (TweetTrove, string, error) {
return result.UserEvents.ToTweetTrove(api.UserID), result.UserEvents.Cursor, nil return result.UserEvents.ToTweetTrove(api.UserID), result.UserEvents.Cursor, nil
} }
// Writes
// ------
func (api *API) SendDMMessage(room_id DMChatRoomID, text string, in_reply_to_id DMMessageID) (TweetTrove, error) { func (api *API) SendDMMessage(room_id DMChatRoomID, text string, in_reply_to_id DMMessageID) (TweetTrove, error) {
if !api.IsAuthenticated { if !api.IsAuthenticated {
return TweetTrove{}, ErrLoginRequired return TweetTrove{}, ErrLoginRequired

View File

@ -21,7 +21,9 @@ func TestParseAPIDMMessage(t *testing.T) {
err = json.Unmarshal(data, &api_message) err = json.Unmarshal(data, &api_message)
require.NoError(t, err) require.NoError(t, err)
message := ParseAPIDMMessage(api_message) trove := api_message.ToTweetTrove()
message, is_ok := trove.Messages[DMMessageID(api_message.ID)]
require.True(t, is_ok)
assert.Equal(message.ID, DMMessageID(1663623203644751885)) assert.Equal(message.ID, DMMessageID(1663623203644751885))
assert.Equal(message.SentAt, TimestampFromUnixMilli(1685473655064)) assert.Equal(message.SentAt, TimestampFromUnixMilli(1685473655064))
assert.Equal(message.DMChatRoomID, DMChatRoomID("1458284524761075714-1488963321701171204")) assert.Equal(message.DMChatRoomID, DMChatRoomID("1458284524761075714-1488963321701171204"))
@ -41,7 +43,9 @@ func TestParseAPIDMMessageWithReaction(t *testing.T) {
err = json.Unmarshal(data, &api_message) err = json.Unmarshal(data, &api_message)
require.NoError(t, err) require.NoError(t, err)
message := ParseAPIDMMessage(api_message) trove := api_message.ToTweetTrove()
message, is_ok := trove.Messages[DMMessageID(api_message.ID)]
require.True(t, is_ok)
assert.Equal(message.ID, DMMessageID(1663623062195957773)) assert.Equal(message.ID, DMMessageID(1663623062195957773))
require.Len(t, message.Reactions, 1) require.Len(t, message.Reactions, 1)

View File

@ -1390,7 +1390,19 @@ func (api API) GetUser(handle UserHandle) (User, error) {
return ParseSingleUser(apiUser) return ParseSingleUser(apiUser)
} }
// Calls API#GetUserByID and returns the parsed result
func GetUserByID(u_id UserID) (User, error) {
session, err := NewGuestSession() // This endpoint works better if you're not logged in
if err != nil {
return User{}, err
}
return session.GetUserByID(u_id)
}
func (api API) GetUserByID(u_id UserID) (User, error) { func (api API) GetUserByID(u_id UserID) (User, error) {
if u_id == UserID(0) {
panic("No Users with ID 0")
}
url, err := url.Parse(GraphqlURL{ url, err := url.Parse(GraphqlURL{
BaseUrl: "https://x.com/i/api/graphql/Qw77dDjp9xCpUY-AXwt-yQ/UserByRestId", BaseUrl: "https://x.com/i/api/graphql/Qw77dDjp9xCpUY-AXwt-yQ/UserByRestId",
Variables: GraphqlVariables{ Variables: GraphqlVariables{

View File

@ -1,6 +0,0 @@
package scraper
// Tokens
// ------
const BEARER_TOKEN string = "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"

View File

@ -1,18 +0,0 @@
package scraper
type ConversationID string
type Conversation struct {
ID ConversationID
Type string
SortEventID int
SortTimestamp int
Participants []User
Nsfw bool
NotificationsDisabled bool
LastReadEventId int
ReadOnly bool
Trusted bool
LowQuality bool
Muted bool
}

View File

@ -1,11 +1,5 @@
package scraper package scraper
import (
"fmt"
"net/url"
"path"
)
type DMChatRoomID string type DMChatRoomID string
// A participant in a chat room. // A participant in a chat room.
@ -45,6 +39,8 @@ type DMChatRoom struct {
Participants map[UserID]DMChatParticipant Participants map[UserID]DMChatParticipant
} }
// TODO: view-layer
// - view helpers should go in a view layer
func (r DMChatRoom) GetParticipantIDs() []UserID { func (r DMChatRoom) GetParticipantIDs() []UserID {
ret := []UserID{} ret := []UserID{}
for user_id := range r.Participants { for user_id := range r.Participants {
@ -52,43 +48,3 @@ func (r DMChatRoom) GetParticipantIDs() []UserID {
} }
return ret return ret
} }
func ParseAPIDMChatRoom(api_room APIDMConversation, current_user_id UserID) DMChatRoom {
ret := DMChatRoom{}
ret.ID = DMChatRoomID(api_room.ConversationID)
ret.Type = api_room.Type
ret.LastMessagedAt = TimestampFromUnixMilli(int64(api_room.SortTimestamp))
ret.IsNSFW = api_room.NSFW
if ret.Type == "GROUP_DM" {
ret.CreatedAt = TimestampFromUnixMilli(int64(api_room.CreateTime))
ret.CreatedByUserID = UserID(api_room.CreatedByUserID)
ret.Name = api_room.Name
ret.AvatarImageRemoteURL = api_room.AvatarImage
tmp_url, err := url.Parse(ret.AvatarImageRemoteURL)
if err != nil {
panic(err)
}
ret.AvatarImageLocalPath = fmt.Sprintf("%s_avatar_%s.%s", ret.ID, path.Base(tmp_url.Path), tmp_url.Query().Get("format"))
}
ret.Participants = make(map[UserID]DMChatParticipant)
for _, api_participant := range api_room.Participants {
participant := DMChatParticipant{}
participant.UserID = UserID(api_participant.UserID)
participant.DMChatRoomID = ret.ID
participant.LastReadEventID = DMMessageID(api_participant.LastReadEventID)
// Process chat settings if this is the logged-in user
if participant.UserID == current_user_id {
participant.IsNotificationsDisabled = api_room.NotificationsDisabled
participant.IsReadOnly = api_room.ReadOnly
participant.IsTrusted = api_room.Trusted
participant.IsMuted = api_room.Muted
participant.Status = api_room.Status
participant.IsChatSettingsValid = true
}
ret.Participants[participant.UserID] = participant
}
return ret
}

View File

@ -10,15 +10,6 @@ type DMReaction struct {
Emoji string `db:"emoji"` Emoji string `db:"emoji"`
} }
func ParseAPIDMReaction(reacc APIDMReaction) DMReaction {
ret := DMReaction{}
ret.ID = DMMessageID(reacc.ID)
ret.SenderID = UserID(reacc.SenderID)
ret.SentAt = TimestampFromUnixMilli(int64(reacc.Time))
ret.Emoji = reacc.Emoji
return ret
}
type DMMessage struct { type DMMessage struct {
ID DMMessageID `db:"id"` ID DMMessageID `db:"id"`
DMChatRoomID DMChatRoomID `db:"chat_room_id"` DMChatRoomID DMChatRoomID `db:"chat_room_id"`
@ -33,67 +24,6 @@ type DMMessage struct {
Images []Image Images []Image
Videos []Video Videos []Video
Urls []Url Urls []Url
}
LastReadEventUserIDs []UserID // Used for rendering
func ParseAPIDMMessage(message APIDMMessage) DMMessage {
ret := DMMessage{}
ret.ID = DMMessageID(message.ID)
ret.SentAt = TimestampFromUnixMilli(int64(message.Time))
ret.DMChatRoomID = DMChatRoomID(message.ConversationID)
ret.SenderID = UserID(message.MessageData.SenderID)
ret.Text = message.MessageData.Text
ret.InReplyToID = DMMessageID(message.MessageData.ReplyData.ID) // Will be "0" if not a reply
ret.Reactions = make(map[UserID]DMReaction)
for _, api_reacc := range message.MessageReactions {
reacc := ParseAPIDMReaction(api_reacc)
reacc.DMMessageID = ret.ID
ret.Reactions[reacc.SenderID] = reacc
}
if message.MessageData.Attachment.Photo.ID != 0 {
new_image := ParseAPIMedia(message.MessageData.Attachment.Photo)
new_image.DMMessageID = ret.ID
ret.Images = []Image{new_image}
}
if message.MessageData.Attachment.Video.ID != 0 {
entity := message.MessageData.Attachment.Video
if entity.Type == "video" || entity.Type == "animated_gif" {
new_video := ParseAPIVideo(entity)
new_video.DMMessageID = ret.ID
ret.Videos = append(ret.Videos, new_video)
}
}
// Process URLs and link previews
for _, url := range message.MessageData.Entities.URLs {
// Skip it if it's an embedded tweet
_, id, is_ok := TryParseTweetUrl(url.ExpandedURL)
if is_ok && id == TweetID(message.MessageData.Attachment.Tweet.Status.ID) {
continue
}
// Skip it if it's an embedded image
if message.MessageData.Attachment.Photo.URL == url.ShortenedUrl {
continue
}
// Skip it if it's an embedded video
if message.MessageData.Attachment.Video.URL == url.ShortenedUrl {
continue
}
var new_url Url
if message.MessageData.Attachment.Card.ShortenedUrl == url.ShortenedUrl {
if message.MessageData.Attachment.Card.Name == "3691233323:audiospace" {
// This "url" is just a link to a Space. Don't process it as a Url
continue
}
new_url = ParseAPIUrlCard(message.MessageData.Attachment.Card)
}
new_url.Text = url.ExpandedURL
new_url.ShortText = url.ShortenedUrl
new_url.DMMessageID = ret.ID
ret.Urls = append(ret.Urls, new_url)
}
return ret
} }

View File

@ -1,63 +0,0 @@
package scraper
func (t TweetTrove) GetOldestMessage(id DMChatRoomID) DMMessageID {
oldest := DMMessageID(^uint(0) >> 1) // Max integer
for _, m := range t.Messages {
if m.ID < oldest && m.DMChatRoomID == id {
oldest = m.ID
}
}
return oldest
}
// TODO: Why are these all here? =>
// Returns a TweetTrove and the cursor for the next update
func (api *API) GetInbox(how_many int) (TweetTrove, string, error) {
if !api.IsAuthenticated {
return TweetTrove{}, "", ErrLoginRequired
}
dm_response, err := api.GetDMInbox()
if err != nil {
panic(err)
}
trove := dm_response.ToTweetTrove(api.UserID)
cursor := dm_response.Cursor
next_cursor_id := dm_response.InboxTimelines.Trusted.MinEntryID
for len(trove.Rooms) < how_many && dm_response.Status != "AT_END" {
dm_response, err = api.GetInboxTrusted(next_cursor_id)
if err != nil {
panic(err)
}
next_trove := dm_response.ToTweetTrove(api.UserID)
next_cursor_id = dm_response.MinEntryID
trove.MergeWith(next_trove)
}
return trove, cursor, nil
}
func (api *API) GetConversation(id DMChatRoomID, max_id DMMessageID, how_many int) (TweetTrove, error) {
if !api.IsAuthenticated {
return TweetTrove{}, ErrLoginRequired
}
dm_response, err := api.GetDMConversation(id, max_id)
if err != nil {
panic(err)
}
trove := dm_response.ToTweetTrove(api.UserID)
oldest := trove.GetOldestMessage(id)
for len(trove.Messages) < how_many && dm_response.Status != "AT_END" {
dm_response, err = api.GetDMConversation(id, oldest)
if err != nil {
panic(err)
}
next_trove := dm_response.ToTweetTrove(api.UserID)
oldest = next_trove.GetOldestMessage(id)
trove.MergeWith(next_trove)
}
return trove, nil
}

View File

@ -11,6 +11,8 @@ import (
"time" "time"
) )
const BEARER_TOKEN string = "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
type GuestTokenResponse struct { type GuestTokenResponse struct {
Token string `json:"guest_token"` Token string `json:"guest_token"`
RefreshedAt time.Time RefreshedAt time.Time

View File

@ -1,9 +1,5 @@
package scraper package scraper
import (
"path"
)
type ImageID int64 type ImageID int64
type Image struct { type Image struct {
@ -16,16 +12,3 @@ type Image struct {
LocalFilename string `db:"local_filename"` LocalFilename string `db:"local_filename"`
IsDownloaded bool `db:"is_downloaded"` IsDownloaded bool `db:"is_downloaded"`
} }
func ParseAPIMedia(apiMedia APIMedia) Image {
local_filename := get_prefixed_path(path.Base(apiMedia.MediaURLHttps))
return Image{
ID: ImageID(apiMedia.ID),
RemoteURL: apiMedia.MediaURLHttps,
Width: apiMedia.OriginalInfo.Width,
Height: apiMedia.OriginalInfo.Height,
LocalFilename: local_filename,
IsDownloaded: false,
}
}

View File

@ -3,6 +3,8 @@ package scraper
import ( import (
"fmt" "fmt"
"net/http" "net/http"
"net/url"
"regexp"
"time" "time"
) )
@ -32,3 +34,43 @@ func ExpandShortUrl(short_url string) string {
} }
return long_url return long_url
} }
// Given an URL, try to parse it as a tweet url.
// The bool is an `is_ok` value; true if the parse was successful, false if it didn't match
func TryParseTweetUrl(s string) (UserHandle, TweetID, bool) {
parsed_url, err := url.Parse(s)
if err != nil {
return UserHandle(""), TweetID(0), false
}
if parsed_url.Host != "twitter.com" && parsed_url.Host != "mobile.twitter.com" && parsed_url.Host != "x.com" {
return UserHandle(""), TweetID(0), false
}
r := regexp.MustCompile(`^/(\w+)/status/(\d+)$`)
matches := r.FindStringSubmatch(parsed_url.Path)
if matches == nil {
return UserHandle(""), TweetID(0), false
}
if len(matches) != 3 { // matches[0] is the full string
panic(matches)
}
return UserHandle(matches[1]), TweetID(int_or_panic(matches[2])), true
}
/**
* Given a tweet URL, return the corresponding user handle.
* If tweet url is not valid, return an error.
*/
func ParseHandleFromTweetUrl(tweet_url string) (UserHandle, error) {
short_url_regex := regexp.MustCompile(`^https://t.co/\w{5,20}$`)
if short_url_regex.MatchString(tweet_url) {
tweet_url = ExpandShortUrl(tweet_url)
}
ret, _, is_ok := TryParseTweetUrl(tweet_url)
if !is_ok {
return "", fmt.Errorf("Invalid tweet url: %s", tweet_url)
}
return ret, nil
}
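A small sketch (not part of this commit) of the URL helpers above, using a made-up status URL:

package scraper

import "fmt"

// Sketch only: the status URL below is made up for illustration.
func example_tweet_url_parsing() {
    handle, id, is_ok := TryParseTweetUrl("https://twitter.com/somebody/status/1234567890")
    fmt.Println(handle, id, is_ok) // "somebody 1234567890 true"

    // Non-status URLs just fail the match instead of panicking.
    _, _, is_ok = TryParseTweetUrl("https://twitter.com/somebody")
    fmt.Println(is_ok) // "false"
}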

View File

@ -1,9 +1,6 @@
package scraper package scraper
import ( import (
"net/url"
"strconv"
"strings"
"time" "time"
) )
@ -29,6 +26,9 @@ type Poll struct {
LastUpdatedAt Timestamp `db:"last_scraped_at"` LastUpdatedAt Timestamp `db:"last_scraped_at"`
} }
// TODO: view-layer
// - view helpers should go in a view layer
func (p Poll) TotalVotes() int { func (p Poll) TotalVotes() int {
return p.Choice1_Votes + p.Choice2_Votes + p.Choice3_Votes + p.Choice4_Votes return p.Choice1_Votes + p.Choice2_Votes + p.Choice3_Votes + p.Choice4_Votes
} }
@ -48,56 +48,3 @@ func (p Poll) IsWinner(votes int) bool {
} }
return votes >= p.Choice1_Votes && votes >= p.Choice2_Votes && votes >= p.Choice3_Votes && votes >= p.Choice4_Votes return votes >= p.Choice1_Votes && votes >= p.Choice2_Votes && votes >= p.Choice3_Votes && votes >= p.Choice4_Votes
} }
func ParseAPIPoll(apiCard APICard) Poll {
card_url, err := url.Parse(apiCard.ShortenedUrl)
if err != nil {
panic(err)
}
id := int_or_panic(card_url.Hostname())
ret := Poll{}
ret.ID = PollID(id)
ret.NumChoices = parse_num_choices(apiCard.Name)
ret.VotingDuration = int_or_panic(apiCard.BindingValues.DurationMinutes.StringValue) * 60
ret.VotingEndsAt, err = TimestampFromString(apiCard.BindingValues.EndDatetimeUTC.StringValue)
if err != nil {
panic(err)
}
ret.LastUpdatedAt, err = TimestampFromString(apiCard.BindingValues.LastUpdatedAt.StringValue)
if err != nil {
panic(err)
}
ret.Choice1 = apiCard.BindingValues.Choice1.StringValue
ret.Choice1_Votes = int_or_panic(apiCard.BindingValues.Choice1_Count.StringValue)
ret.Choice2 = apiCard.BindingValues.Choice2.StringValue
ret.Choice2_Votes = int_or_panic(apiCard.BindingValues.Choice2_Count.StringValue)
if ret.NumChoices > 2 {
ret.Choice3 = apiCard.BindingValues.Choice3.StringValue
ret.Choice3_Votes = int_or_panic(apiCard.BindingValues.Choice3_Count.StringValue)
}
if ret.NumChoices > 3 {
ret.Choice4 = apiCard.BindingValues.Choice4.StringValue
ret.Choice4_Votes = int_or_panic(apiCard.BindingValues.Choice4_Count.StringValue)
}
return ret
}
func parse_num_choices(card_name string) int {
if strings.Index(card_name, "poll") != 0 || strings.Index(card_name, "choice") != 5 {
panic("Not valid card name: " + card_name)
}
return int_or_panic(card_name[4:5])
}
func int_or_panic(s string) int {
result, err := strconv.Atoi(s)
if err != nil {
panic(err)
}
return result
}

View File

@ -8,16 +8,3 @@ type Retweet struct {
RetweetedBy *User RetweetedBy *User
RetweetedAt Timestamp `db:"retweeted_at"` RetweetedAt Timestamp `db:"retweeted_at"`
} }
func ParseSingleRetweet(apiTweet APITweet) (ret Retweet, err error) {
apiTweet.NormalizeContent()
ret.RetweetID = TweetID(apiTweet.ID)
ret.TweetID = TweetID(apiTweet.RetweetedStatusID)
ret.RetweetedByID = UserID(apiTweet.UserID)
ret.RetweetedAt, err = TimestampFromString(apiTweet.CreatedAt)
if err != nil {
panic(err)
}
return
}

View File

@ -13,16 +13,23 @@ import (
func TestParseSingleRetweet(t *testing.T) { func TestParseSingleRetweet(t *testing.T) {
assert := assert.New(t) assert := assert.New(t)
require := require.New(t)
data, err := os.ReadFile("test_responses/tweet_that_is_a_retweet.json") data, err := os.ReadFile("test_responses/tweet_that_is_a_retweet.json")
if err != nil { if err != nil {
panic(err) panic(err)
} }
var api_tweet APITweet var api_tweet APITweet
err = json.Unmarshal(data, &api_tweet) err = json.Unmarshal(data, &api_tweet)
require.NoError(t, err) require.NoError(err)
retweet, err := ParseSingleRetweet(api_tweet) trove, err := api_tweet.ToTweetTrove()
require.NoError(t, err) require.NoError(err)
require.Len(trove.Tweets, 0)
require.Len(trove.Retweets, 1)
retweet, is_ok := trove.Retweets[TweetID(1404270043018448896)]
require.True(is_ok)
assert.Equal(TweetID(1404270043018448896), retweet.RetweetID) assert.Equal(TweetID(1404270043018448896), retweet.RetweetID)
assert.Equal(TweetID(1404269989646028804), retweet.TweetID) assert.Equal(TweetID(1404269989646028804), retweet.TweetID)

View File

@ -26,6 +26,9 @@ type Space struct {
IsDetailsFetched bool `db:"is_details_fetched"` IsDetailsFetched bool `db:"is_details_fetched"`
} }
// TODO: view-layer
// - view helpers should go in a view layer
func (space Space) FormatDuration() string { func (space Space) FormatDuration() string {
duration := space.EndedAt.Time.Sub(space.StartedAt.Time) duration := space.EndedAt.Time.Sub(space.StartedAt.Time)
h := int(duration.Hours()) h := int(duration.Hours())
@ -37,14 +40,3 @@ func (space Space) FormatDuration() string {
} }
return fmt.Sprintf("%dm%02ds", m, s) return fmt.Sprintf("%dm%02ds", m, s)
} }
func ParseAPISpace(apiCard APICard) Space {
ret := Space{}
ret.ID = SpaceID(apiCard.BindingValues.ID.StringValue)
ret.ShortUrl = apiCard.ShortenedUrl
// Indicate that this Space needs its details fetched still
ret.IsDetailsFetched = false
return ret
}

View File

@ -1,31 +1,13 @@
package scraper_test package scraper_test
import ( import (
"encoding/json"
"os"
"testing" "testing"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
. "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper" . "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
) )
func TestParseSpace(t *testing.T) {
assert := assert.New(t)
data, err := os.ReadFile("test_responses/tweet_content/space.json")
if err != nil {
panic(err)
}
var apiCard APICard
err = json.Unmarshal(data, &apiCard)
require.NoError(t, err)
space := ParseAPISpace(apiCard)
assert.Equal(SpaceID("1YpKkZVyQjoxj"), space.ID)
assert.Equal("https://t.co/WBPAHNF8Om", space.ShortUrl)
}
func TestFormatSpaceDuration(t *testing.T) { func TestFormatSpaceDuration(t *testing.T) {
assert := assert.New(t) assert := assert.New(t)
s := Space{ s := Space{

View File

@ -1 +0,0 @@
{"name":"3691233323:audiospace","url":"https://t.co/WBPAHNF8Om","card_type_url":"http://card-type-url-is-deprecated.invalid","binding_values":{"id":{"type":"STRING","string_value":"1YpKkZVyQjoxj"},"narrow_cast_space_type":{"type":"STRING","string_value":"0"},"card_url":{"type":"STRING","string_value":"https://t.co/WBPAHNF8Om","scribe_key":"card_url"}},"card_platform":{"platform":{"device":{"name":"Swift","version":"12"},"audience":{"name":"production"}}}}

View File

@ -5,9 +5,6 @@ import (
"errors" "errors"
"fmt" "fmt"
"strings" "strings"
"time"
"gitlab.com/offline-twitter/twitter_offline_engine/pkg/terminal_utils"
) )
var ERR_NO_TWEET = errors.New("Empty tweet") var ERR_NO_TWEET = errors.New("Empty tweet")
@ -77,172 +74,6 @@ type Tweet struct {
IsConversationScraped bool `db:"is_conversation_scraped"` IsConversationScraped bool `db:"is_conversation_scraped"`
LastScrapedAt Timestamp `db:"last_scraped_at"` LastScrapedAt Timestamp `db:"last_scraped_at"`
} }
func (t Tweet) String() string {
var author string
if t.User != nil {
author = fmt.Sprintf("%s\n@%s", t.User.DisplayName, t.User.Handle)
} else {
author = "@???"
}
ret := fmt.Sprintf(
`%s
%s
%s
Replies: %d RT: %d QT: %d Likes: %d
`,
author,
terminal_utils.FormatDate(t.PostedAt.Time),
terminal_utils.WrapText(t.Text, 60),
t.NumReplies,
t.NumRetweets,
t.NumQuoteTweets,
t.NumLikes,
)
if len(t.Images) > 0 {
ret += fmt.Sprintf(terminal_utils.COLOR_GREEN+"images: %d\n"+terminal_utils.COLOR_RESET, len(t.Images))
}
if len(t.Urls) > 0 {
ret += "urls: [\n"
for _, url := range t.Urls {
ret += " " + url.Text + "\n"
}
ret += "]"
}
return ret
}
// Turn an APITweet, as returned from the scraper, into a properly structured Tweet object
func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
apiTweet.NormalizeContent()
ret.ID = TweetID(apiTweet.ID)
ret.UserID = UserID(apiTweet.UserID)
ret.UserHandle = UserHandle(apiTweet.UserHandle)
ret.Text = apiTweet.FullText
ret.IsExpandable = apiTweet.IsExpandable
// Process "posted-at" date and time
if apiTweet.TombstoneText == "" { // Skip time parsing for tombstones
ret.PostedAt, err = TimestampFromString(apiTweet.CreatedAt)
if err != nil {
if ret.ID == 0 {
return Tweet{}, fmt.Errorf("unable to parse tweet: %w", ERR_NO_TWEET)
}
return Tweet{}, fmt.Errorf("Error parsing time on tweet ID %d:\n %w", ret.ID, err)
}
}
ret.NumLikes = apiTweet.FavoriteCount
ret.NumRetweets = apiTweet.RetweetCount
ret.NumReplies = apiTweet.ReplyCount
ret.NumQuoteTweets = apiTweet.QuoteCount
ret.InReplyToID = TweetID(apiTweet.InReplyToStatusID)
ret.QuotedTweetID = TweetID(apiTweet.QuotedStatusID)
// Process URLs and link previews
for _, url := range apiTweet.Entities.URLs {
var url_object Url
if apiTweet.Card.ShortenedUrl == url.ShortenedUrl {
if apiTweet.Card.Name == "3691233323:audiospace" {
// This "url" is just a link to a Space. Don't process it as a Url
continue
}
url_object = ParseAPIUrlCard(apiTweet.Card)
}
url_object.Text = url.ExpandedURL
url_object.ShortText = url.ShortenedUrl
url_object.TweetID = ret.ID
// Skip it if it's just the quoted tweet
_, id, is_ok := TryParseTweetUrl(url.ExpandedURL)
if is_ok && id == ret.QuotedTweetID {
continue
}
ret.Urls = append(ret.Urls, url_object)
}
// Process images
for _, media := range apiTweet.Entities.Media {
if media.Type != "photo" {
// Videos now have an entry in "Entities.Media" but they can be ignored; the useful bit is in ExtendedEntities
// So skip ones that aren't "photo"
continue
}
new_image := ParseAPIMedia(media)
new_image.TweetID = ret.ID
ret.Images = append(ret.Images, new_image)
}
// Process hashtags
for _, hashtag := range apiTweet.Entities.Hashtags {
ret.Hashtags = append(ret.Hashtags, hashtag.Text)
}
// Process `@` mentions and reply-mentions
for _, mention := range apiTweet.Entities.Mentions {
ret.Mentions = append(ret.Mentions, mention.UserName)
}
for _, mention := range strings.Split(apiTweet.Entities.ReplyMentions, " ") {
if mention != "" {
if mention[0] != '@' {
panic(fmt.Errorf("Unknown ReplyMention value %q:\n %w", apiTweet.Entities.ReplyMentions, EXTERNAL_API_ERROR))
}
ret.ReplyMentions = append(ret.ReplyMentions, mention[1:])
}
}
// Process videos
for _, entity := range apiTweet.ExtendedEntities.Media {
if entity.Type != "video" && entity.Type != "animated_gif" {
continue
}
new_video := ParseAPIVideo(entity)
new_video.TweetID = ret.ID
ret.Videos = append(ret.Videos, new_video)
// Remove the thumbnail from the Images list
updated_imgs := []Image{}
for _, img := range ret.Images {
if VideoID(img.ID) != new_video.ID {
updated_imgs = append(updated_imgs, img)
}
}
ret.Images = updated_imgs
}
// Process polls
if strings.Index(apiTweet.Card.Name, "poll") == 0 {
poll := ParseAPIPoll(apiTweet.Card)
poll.TweetID = ret.ID
ret.Polls = []Poll{poll}
}
// Process spaces
if apiTweet.Card.Name == "3691233323:audiospace" {
space := ParseAPISpace(apiTweet.Card)
ret.Spaces = []Space{space}
ret.SpaceID = space.ID
}
// Process tombstones and other metadata
ret.TombstoneType = apiTweet.TombstoneText
ret.IsStub = !(ret.TombstoneType == "")
ret.LastScrapedAt = TimestampFromUnix(0) // Caller will change this for the tweet that was actually scraped
ret.IsConversationScraped = false // Safe due to the "No Worsening" principle
// Extra data that can help piece together tombstoned tweet info
ret.in_reply_to_user_id = UserID(apiTweet.InReplyToUserID)
ret.in_reply_to_user_handle = UserHandle(apiTweet.InReplyToScreenName)
return
}
// Get a single tweet with no replies from the API. // Get a single tweet with no replies from the API.
// //
// args: // args:

View File

@ -195,3 +195,13 @@ func (trove *TweetTrove) PostProcess(api *API) error {
} }
return nil return nil
} }
func (t TweetTrove) GetOldestMessage(id DMChatRoomID) DMMessageID {
oldest := DMMessageID(^uint(0) >> 1) // Max integer
for _, m := range t.Messages {
if m.ID < oldest && m.DMChatRoomID == id {
oldest = m.ID
}
}
return oldest
}

View File

@ -1,11 +1,7 @@
package scraper package scraper
import ( import (
"fmt"
"log"
"net/url" "net/url"
"path"
"regexp"
) )
type Url struct { type Url struct {
@ -28,6 +24,9 @@ type Url struct {
IsContentDownloaded bool `db:"is_content_downloaded"` IsContentDownloaded bool `db:"is_content_downloaded"`
} }
// TODO: view-layer
// - view helpers should go in a view layer
func (u Url) GetDomain() string { func (u Url) GetDomain() string {
if u.Domain != "" { if u.Domain != "" {
return u.Domain return u.Domain
@ -38,106 +37,3 @@ func (u Url) GetDomain() string {
} }
return urlstruct.Host return urlstruct.Host
} }
func ParseAPIUrlCard(apiCard APICard) Url {
values := apiCard.BindingValues
ret := Url{}
ret.HasCard = true
ret.Domain = values.Domain.Value
ret.Title = values.Title.Value
ret.Description = values.Description.Value
ret.IsContentDownloaded = false
ret.CreatorID = UserID(values.Creator.UserValue.Value)
ret.SiteID = UserID(values.Site.UserValue.Value)
var thumbnail_url string
if apiCard.Name == "summary_large_image" || apiCard.Name == "summary" {
thumbnail_url = values.Thumbnail.ImageValue.Url
} else if apiCard.Name == "player" {
thumbnail_url = values.PlayerImage.ImageValue.Url
} else if apiCard.Name == "unified_card" {
// TODO: Grok chat previews
log.Print("Grok chat card, not implemented yet-- skipping")
} else {
panic("Unknown card type: " + apiCard.Name)
}
if thumbnail_url != "" {
ret.HasThumbnail = true
ret.ThumbnailRemoteUrl = thumbnail_url
ret.ThumbnailLocalPath = get_thumbnail_local_path(thumbnail_url)
ret.ThumbnailWidth = values.Thumbnail.ImageValue.Width
ret.ThumbnailHeight = values.Thumbnail.ImageValue.Height
}
return ret
}
func get_prefixed_path(p string) string {
local_prefix_regex := regexp.MustCompile(`^[\w-]{2}`)
local_prefix := local_prefix_regex.FindString(p)
if len(local_prefix) != 2 {
panic(fmt.Sprintf("Unable to extract a 2-letter prefix for filename %s", p))
}
return path.Join(local_prefix, p)
}
func get_thumbnail_local_path(remote_url string) string {
u, err := url.Parse(remote_url)
if err != nil {
panic(err)
}
if u.RawQuery == "" {
return path.Base(u.Path)
}
query_params, err := url.ParseQuery(u.RawQuery)
if err != nil {
panic(err)
}
return get_prefixed_path(
fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0]),
)
}
// Given an URL, try to parse it as a tweet url.
// The bool is an `is_ok` value; true if the parse was successful, false if it didn't match
func TryParseTweetUrl(s string) (UserHandle, TweetID, bool) {
parsed_url, err := url.Parse(s)
if err != nil {
return UserHandle(""), TweetID(0), false
}
if parsed_url.Host != "twitter.com" && parsed_url.Host != "mobile.twitter.com" && parsed_url.Host != "x.com" {
return UserHandle(""), TweetID(0), false
}
r := regexp.MustCompile(`^/(\w+)/status/(\d+)$`)
matches := r.FindStringSubmatch(parsed_url.Path)
if matches == nil {
return UserHandle(""), TweetID(0), false
}
if len(matches) != 3 { // matches[0] is the full string
panic(matches)
}
return UserHandle(matches[1]), TweetID(int_or_panic(matches[2])), true
}
/**
* Given a tweet URL, return the corresponding user handle.
* If tweet url is not valid, return an error.
*/
func ParseHandleFromTweetUrl(tweet_url string) (UserHandle, error) {
short_url_regex := regexp.MustCompile(`^https://t.co/\w{5,20}$`)
if short_url_regex.MatchString(tweet_url) {
tweet_url = ExpandShortUrl(tweet_url)
}
ret, _, is_ok := TryParseTweetUrl(tweet_url)
if !is_ok {
return "", fmt.Errorf("Invalid tweet url: %s", tweet_url)
}
return ret, nil
}

View File

@ -4,9 +4,6 @@ import (
"fmt" "fmt"
"path" "path"
"regexp" "regexp"
"strings"
"gitlab.com/offline-twitter/twitter_offline_engine/pkg/terminal_utils"
) )
const DEFAULT_PROFILE_IMAGE_URL = "https://abs.twimg.com/sticky/default_profile_images/default_profile.png" const DEFAULT_PROFILE_IMAGE_URL = "https://abs.twimg.com/sticky/default_profile_images/default_profile.png"
@ -15,14 +12,6 @@ const DEFAULT_PROFILE_IMAGE = "default_profile.png"
type UserID int64 type UserID int64
type UserHandle string type UserHandle string
func JoinArrayOfHandles(handles []UserHandle) string {
ret := []string{}
for _, h := range handles {
ret = append(ret, string(h))
}
return strings.Join(ret, ",")
}
type User struct { type User struct {
ID UserID `db:"id"` ID UserID `db:"id"`
DisplayName string `db:"display_name"` DisplayName string `db:"display_name"`
@ -51,40 +40,6 @@ type User struct {
IsIdFake bool `db:"is_id_fake"` IsIdFake bool `db:"is_id_fake"`
} }
func (u User) String() string {
var verified string
if u.IsVerified {
verified = "[\u2713]"
}
ret := fmt.Sprintf(
`%s%s
@%s
%s
Following: %d Followers: %d
Joined %s
%s
%s
`,
u.DisplayName,
verified,
u.Handle,
terminal_utils.WrapText(u.Bio, 60),
u.FollowingCount,
u.FollowersCount,
terminal_utils.FormatDate(u.JoinDate.Time),
u.Location,
u.Website,
)
if u.PinnedTweet != nil {
ret += "\n" + terminal_utils.WrapText(u.PinnedTweet.Text, 60)
} else {
println("Pinned tweet id:", u.PinnedTweetID)
}
return ret
}
func GetUnknownUser() User { func GetUnknownUser() User {
return User{ return User{
ID: UserID(0x4000000000000000), // 2^62 ID: UserID(0x4000000000000000), // 2^62
@ -125,63 +80,6 @@ func GetUnknownUserWithHandle(handle UserHandle) User {
} }
} }
// Turn an APIUser, as returned from the scraper, into a properly structured User object
func ParseSingleUser(apiUser APIUser) (ret User, err error) {
if apiUser.DoesntExist {
// User may have been deleted, or there was a typo. There's no data to parse
if apiUser.ScreenName == "" {
panic("ScreenName is empty!")
}
ret = GetUnknownUserWithHandle(UserHandle(apiUser.ScreenName))
return
}
ret.ID = UserID(apiUser.ID)
ret.Handle = UserHandle(apiUser.ScreenName)
if apiUser.IsBanned {
// Banned users won't have any further info, so just return here
ret.IsBanned = true
return
}
ret.DisplayName = apiUser.Name
ret.Bio = apiUser.Description
ret.FollowingCount = apiUser.FriendsCount
ret.FollowersCount = apiUser.FollowersCount
ret.Location = apiUser.Location
if len(apiUser.Entities.URL.Urls) > 0 {
ret.Website = apiUser.Entities.URL.Urls[0].ExpandedURL
}
ret.JoinDate, err = TimestampFromString(apiUser.CreatedAt)
if err != nil {
err = fmt.Errorf("Error parsing time on user ID %d: %w", ret.ID, err)
return
}
ret.IsPrivate = apiUser.Protected
ret.IsVerified = apiUser.Verified
ret.ProfileImageUrl = apiUser.ProfileImageURLHTTPS
if regexp.MustCompile(`_normal\.\w{2,4}`).MatchString(ret.ProfileImageUrl) {
ret.ProfileImageUrl = strings.ReplaceAll(ret.ProfileImageUrl, "_normal.", ".")
}
ret.BannerImageUrl = apiUser.ProfileBannerURL
ret.ProfileImageLocalPath = ret.compute_profile_image_local_path()
ret.BannerImageLocalPath = ret.compute_banner_image_local_path()
if len(apiUser.PinnedTweetIdsStr) > 0 {
ret.PinnedTweetID = TweetID(idstr_to_int(apiUser.PinnedTweetIdsStr[0]))
}
return
}
// Calls API#GetUserByID and returns the parsed result
func GetUserByID(u_id UserID) (User, error) {
session, err := NewGuestSession() // This endpoint works better if you're not logged in
if err != nil {
return User{}, err
}
return session.GetUserByID(u_id)
}
/** /**
* Make a filename for the profile image, that hopefully won't clobber other ones * Make a filename for the profile image, that hopefully won't clobber other ones
*/ */

View File

@ -1,16 +1,7 @@
package scraper package scraper
import (
"net/url"
"path"
"sort"
)
type VideoID int64 type VideoID int64
// TODO video-source-user: extract source user information (e.g., someone shares a video
// from someone else).
type Video struct { type Video struct {
ID VideoID `db:"id"` ID VideoID `db:"id"`
TweetID TweetID `db:"tweet_id"` TweetID TweetID `db:"tweet_id"`
@ -30,56 +21,3 @@ type Video struct {
IsGeoblocked bool `db:"is_geoblocked"` IsGeoblocked bool `db:"is_geoblocked"`
IsGif bool `db:"is_gif"` IsGif bool `db:"is_gif"`
} }
func get_filename(remote_url string) string {
u, err := url.Parse(remote_url)
if err != nil {
panic(err)
}
return path.Base(u.Path)
}
func ParseAPIVideo(apiVideo APIExtendedMedia) Video {
variants := apiVideo.VideoInfo.Variants
sort.Sort(variants)
video_remote_url := variants[0].URL
var view_count int
r := apiVideo.Ext.MediaStats.R
switch r.(type) {
case string:
view_count = 0
case map[string]interface{}:
OK_entry, ok := r.(map[string]interface{})["ok"]
if !ok {
panic("No 'ok' value found in the R!")
}
view_count_str, ok := OK_entry.(map[string]interface{})["viewCount"]
view_count = int_or_panic(view_count_str.(string))
if !ok {
panic("No 'viewCount' value found in the OK!")
}
}
local_filename := get_prefixed_path(get_filename(video_remote_url))
return Video{
ID: VideoID(apiVideo.ID),
Width: apiVideo.OriginalInfo.Width,
Height: apiVideo.OriginalInfo.Height,
RemoteURL: video_remote_url,
LocalFilename: local_filename,
ThumbnailRemoteUrl: apiVideo.MediaURLHttps,
ThumbnailLocalPath: get_prefixed_path(path.Base(apiVideo.MediaURLHttps)),
Duration: apiVideo.VideoInfo.Duration,
ViewCount: view_count,
IsDownloaded: false,
IsBlockedByDMCA: false,
IsGeoblocked: apiVideo.ExtMediaAvailability.Reason == "Geoblocked",
IsGif: apiVideo.Type == "animated_gif",
}
}