package scraper
import (
"fmt"
"html"
"time"
"strings"
"encoding/json"
"strconv"
"sort"
)
type APIMedia struct {
ID int64 `json:"id_str,string"`
MediaURLHttps string `json:"media_url_https"`
Type string `json:"type"`
URL string `json:"url"`
OriginalInfo struct {
Width int `json:"width"`
Height int `json:"height"`
} `json:"original_info"`
}
type SortableVariants []struct {
Bitrate int `json:"bitrate,omitempty"`
URL string `json:"url"`
}
func (v SortableVariants) Len() int { return len(v) }
func (v SortableVariants) Swap(i, j int) { v[i], v[j] = v[j], v[i] }
func (v SortableVariants) Less(i, j int) bool { return v[i].Bitrate > v[j].Bitrate }
type APIExtendedMedia struct {
ID int64 `json:"id_str,string"`
MediaURLHttps string `json:"media_url_https"`
Type string `json:"type"`
VideoInfo struct {
Variants SortableVariants `json:"variants"`
Duration int `json:"duration_millis"`
} `json:"video_info"`
OriginalInfo struct {
Width int `json:"width"`
Height int `json:"height"`
} `json:"original_info"`
Ext struct {
MediaStats struct {
R interface{} `json:"r"`
} `json:"mediaStats"`
} `json:"ext"`
}
type APICard struct {
Name string `json:"name"`
ShortenedUrl string `json:"url"`
BindingValues struct {
Domain struct {
Value string `json:"string_value"`
} `json:"domain"`
Creator struct {
UserValue struct {
Value int64 `json:"id_str,string"`
} `json:"user_value"`
} `json:"creator"`
Site struct {
UserValue struct {
Value int64 `json:"id_str,string"`
} `json:"user_value"`
} `json:"site"`
Title struct {
Value string `json:"string_value"`
} `json:"title"`
Description struct {
Value string `json:"string_value"`
} `json:"description"`
Thumbnail struct {
ImageValue struct {
Url string `json:"url"`
Width int `json:"width"`
Height int `json:"height"`
} `json:"image_value"`
} `json:"thumbnail_image_large"`
PlayerImage struct {
ImageValue struct {
Url string `json:"url"`
} `json:"image_value"`
} `json:"player_image_large"`
// For polls
Choice1 struct {
StringValue string `json:"string_value"`
} `json:"choice1_label"`
Choice2 struct {
StringValue string `json:"string_value"`
} `json:"choice2_label"`
Choice3 struct {
StringValue string `json:"string_value"`
} `json:"choice3_label"`
Choice4 struct {
StringValue string `json:"string_value"`
} `json:"choice4_label"`
Choice1_Count struct {
StringValue string `json:"string_value"`
} `json:"choice1_count"`
Choice2_Count struct {
StringValue string `json:"string_value"`
} `json:"choice2_count"`
Choice3_Count struct {
StringValue string `json:"string_value"`
} `json:"choice3_count"`
Choice4_Count struct {
StringValue string `json:"string_value"`
} `json:"choice4_count"`
EndDatetimeUTC struct {
StringValue string `json:"string_value"`
} `json:"end_datetime_utc"`
CountsAreFinal struct {
BooleanValue bool `json:"boolean_value"`
} `json:"counts_are_final"`
DurationMinutes struct {
StringValue string `json:"string_value"`
} `json:"duration_minutes"`
LastUpdatedAt struct {
StringValue string `json:"string_value"`
} `json:"last_updated_datetime_utc"`
} `json:"binding_values"`
}
type APITweet struct {
ID int64 `json:"id_str,string"`
ConversationID int64 `json:"conversation_id_str,string"`
CreatedAt string `json:"created_at"`
FavoriteCount int `json:"favorite_count"`
FullText string `json:"full_text"`
DisplayTextRange []int `json:"display_text_range"`
Entities struct {
Hashtags []struct {
Text string `json:"text"`
} `json:"hashtags"`
Media []APIMedia `json:"media"`
URLs []struct {
ExpandedURL string `json:"expanded_url"`
ShortenedUrl string `json:"url"`
} `json:"urls"`
Mentions []struct {
UserName string `json:"screen_name"`
UserID int64 `json:"id_str,string"`
} `json:"user_mentions"`
ReplyMentions string // The leading part of the text which is cut off by "DisplayTextRange"
} `json:"entities"`
ExtendedEntities struct {
Media []APIExtendedMedia `json:"media"`
} `json:"extended_entities"`
InReplyToStatusID int64 `json:"in_reply_to_status_id_str,string"`
InReplyToUserID int64 `json:"in_reply_to_user_id_str,string"`
InReplyToScreenName string `json:"in_reply_to_screen_name"`
ReplyCount int `json:"reply_count"`
RetweetCount int `json:"retweet_count"`
QuoteCount int `json:"quote_count"`
RetweetedStatusIDStr string `json:"retweeted_status_id_str"` // Can be empty string
RetweetedStatusID int64
QuotedStatusIDStr string `json:"quoted_status_id_str"` // Can be empty string
QuotedStatusID int64
QuotedStatusPermalink struct {
URL string `json:"url"`
ExpandedURL string `json:"expanded"`
} `json:"quoted_status_permalink"`
Time time.Time `json:"time"`
UserID int64 `json:"user_id_str,string"`
Card APICard `json:"card"`
TombstoneText string
}
func (t *APITweet) NormalizeContent() {
id, err := strconv.Atoi(t.QuotedStatusIDStr)
if err == nil {
t.QuotedStatusID = int64(id)
}
id, err = strconv.Atoi(t.RetweetedStatusIDStr)
if err == nil {
t.RetweetedStatusID = int64(id)
}
if (len(t.DisplayTextRange) == 2) {
t.Entities.ReplyMentions = strings.TrimSpace(string([]rune(t.FullText)[0:t.DisplayTextRange[0]]))
t.FullText = string([]rune(t.FullText)[t.DisplayTextRange[0]:t.DisplayTextRange[1]])
}
// Handle pasted tweet links that turn into quote tweets but still have a link in them
if t.QuotedStatusID != 0 {
for _, url := range t.Entities.URLs {
if url.ShortenedUrl == t.QuotedStatusPermalink.URL {
t.FullText = strings.ReplaceAll(t.FullText, url.ShortenedUrl, "")
}
}
}
t.FullText = html.UnescapeString(t.FullText)
t.FullText = strings.TrimSpace(t.FullText)
}
func (t APITweet) String() string {
data, err := json.Marshal(t)
if err != nil {
panic(err)
}
return string(data)
}
type APIUser struct {
CreatedAt string `json:"created_at"`
Description string `json:"description"`
Entities struct {
URL struct {
Urls []struct {
ExpandedURL string `json:"expanded_url"`
} `json:"urls"`
} `json:"url"`
} `json:"entities"`
FavouritesCount int `json:"favourites_count"`
FollowersCount int `json:"followers_count"`
FriendsCount int `json:"friends_count"`
ID int64 `json:"id_str,string"`
ListedCount int `json:"listed_count"`
Name string `json:"name"`
Location string `json:"location"`
PinnedTweetIdsStr []string `json:"pinned_tweet_ids_str"` // Dunno how to type-convert an array
ProfileBannerURL string `json:"profile_banner_url"`
ProfileImageURLHTTPS string `json:"profile_image_url_https"`
Protected bool `json:"protected"`
ScreenName string `json:"screen_name"`
StatusesCount int `json:"statuses_count"`
Verified bool `json:"verified"`
}
type UserResponse struct {
Data struct {
User struct {
ID int64 `json:"rest_id,string"`
Legacy APIUser `json:"legacy"`
} `json:"user"`
} `json:"data"`
Errors []struct {
Message string `json:"message"`
Code int `json:"code"`
} `json:"errors"`
}
func (u UserResponse) ConvertToAPIUser() APIUser {
ret := u.Data.User.Legacy
ret.ID = u.Data.User.ID
return ret
}
type Entry struct {
EntryID string `json:"entryId"`
SortIndex int64 `json:"sortIndex,string"`
Content struct {
Item struct {
Content struct {
Tombstone struct {
TombstoneInfo struct {
RichText struct {
Text string `json:"text"`
} `json:"richText"`
} `json:"tombstoneInfo"`
} `json:"tombstone"`
Tweet struct {
ID int64 `json:"id,string"`
} `json:"tweet"`
} `json:"content"`
} `json:"item"`
Operation struct {
Cursor struct {
Value string `json:"value"`
} `json:"cursor"`
} `json:"operation"`
} `json:"content"`
}
func (e Entry) GetTombstoneText() string {
return e.Content.Item.Content.Tombstone.TombstoneInfo.RichText.Text
}
type SortableEntries []Entry
func (e SortableEntries) Len() int { return len(e) }
func (e SortableEntries) Swap(i, j int) { e[i], e[j] = e[j], e[i] }
func (e SortableEntries) Less(i, j int) bool { return e[i].SortIndex > e[j].SortIndex }
type TweetResponse struct {
GlobalObjects struct {
Tweets map[string]APITweet `json:"tweets"`
Users map[string]APIUser `json:"users"`
} `json:"globalObjects"`
Timeline struct {
Instructions []struct {
AddEntries struct {
Entries SortableEntries `json:"entries"`
} `json:"addEntries"`
ReplaceEntry struct {
Entry Entry
} `json:"replaceEntry"`
} `json:"instructions"`
} `json:"timeline"`
}
var tombstone_types = map[string]string{
"This Tweet was deleted by the Tweet author. Learn more": "deleted",
"This Tweet is from a suspended account. Learn more": "suspended",
"You’re unable to view this Tweet because this account owner limits who can view their Tweets. Learn more": "hidden",
"This Tweet is unavailable. Learn more": "unavailable",
"This Tweet violated the Twitter Rules. Learn more": "violated",
"This Tweet is from an account that no longer exists. Learn more": "no longer exists",
}
/**
* Insert tweets into GlobalObjects for each tombstone. Returns a list of users that need to
* be fetched for tombstones.
*/
func (t *TweetResponse) HandleTombstones() []string {
ret := []string{}
entries := t.Timeline.Instructions[0].AddEntries.Entries
sort.Sort(entries)
for i, entry := range entries {
if entry.GetTombstoneText() != "" {
// Try to reconstruct the tombstone tweet
var tombstoned_tweet APITweet
tombstoned_tweet.ID = int64(i) // Set a default to prevent clobbering other tombstones
if i + 1 < len(entries) && entries[i+1].Content.Item.Content.Tweet.ID != 0 {
next_tweet_id := entries[i+1].Content.Item.Content.Tweet.ID
api_tweet, ok := t.GlobalObjects.Tweets[fmt.Sprint(next_tweet_id)]
if !ok {
panic("Weird situation!")
}
tombstoned_tweet.ID = api_tweet.InReplyToStatusID
tombstoned_tweet.UserID = api_tweet.InReplyToUserID
ret = append(ret, api_tweet.InReplyToScreenName)
}
if i - 1 >= 0 && entries[i-1].Content.Item.Content.Tweet.ID != 0 {
prev_tweet_id := entries[i-1].Content.Item.Content.Tweet.ID
_, ok := t.GlobalObjects.Tweets[fmt.Sprint(prev_tweet_id)]
if !ok {
panic("Weird situation 2!")
}
tombstoned_tweet.InReplyToStatusID = prev_tweet_id
}
short_text, ok := tombstone_types[entry.GetTombstoneText()]
if !ok {
panic(fmt.Sprintf("Unknown tombstone text: %s", entry.GetTombstoneText()))
}
tombstoned_tweet.TombstoneText = short_text
// Add the tombstoned tweet to GlobalObjects
t.GlobalObjects.Tweets[fmt.Sprint(tombstoned_tweet.ID)] = tombstoned_tweet
}
}
return ret
}
func (t *TweetResponse) GetCursor() string {
entries := t.Timeline.Instructions[0].AddEntries.Entries
if len(entries) > 0 {
last_entry := entries[len(entries) - 1]
if strings.Contains(last_entry.EntryID, "cursor") {
return last_entry.Content.Operation.Cursor.Value
}
}
// Next, try the other format ("replaceEntry")
instructions := t.Timeline.Instructions
last_replace_entry := instructions[len(instructions) - 1].ReplaceEntry.Entry
if strings.Contains(last_replace_entry.EntryID, "cursor") {
return last_replace_entry.Content.Operation.Cursor.Value
}
return ""
}
/**
* Test for one case of end-of-feed. Cursor increments on each request for some reason, but
* there's no new content. This seems to happen when there's a pinned tweet.
*
* In this case, we look for an "entries" object that has only cursors in it, and no tweets.
*/
func (t *TweetResponse) IsEndOfFeed() bool {
entries := t.Timeline.Instructions[0].AddEntries.Entries
if len(entries) > 2 {
return false
}
for _, e := range entries {
if !strings.Contains(e.EntryID, "cursor") {
return false
}
}
return true
}
func idstr_to_int(idstr string) int64 {
id, err := strconv.Atoi(idstr)
if err != nil {
panic(err)
}
return int64(id)
}