355 lines
9.9 KiB
Go
355 lines
9.9 KiB
Go
package scraper
|
|
|
|
import (
|
|
"database/sql/driver"
|
|
"errors"
|
|
"fmt"
|
|
log "github.com/sirupsen/logrus"
|
|
"strings"
|
|
"time"
|
|
|
|
"gitlab.com/offline-twitter/twitter_offline_engine/pkg/terminal_utils"
|
|
)
|
|
|
|
// ERR_NO_TWEET is the sentinel error wrapped by ParseSingleTweet when the API object has a
// zero tweet ID and its creation time cannot be parsed. Detect it with errors.Is.
var ERR_NO_TWEET = errors.New("Empty tweet")
|
|
|
|
// TweetID is the integer identifier of a tweet.
type TweetID int64
|
|
|
|
// CommaSeparatedList is a list of strings that is stored as a single comma-joined
// string (e.g. "a,b,c").
type CommaSeparatedList []string

// Scan implements the database/sql Scanner interface.
//
// The receiver is reset to an empty list and then filled with the non-empty,
// comma-separated items of src. src may be a string, a []byte, or nil (which yields
// an empty list). Any other source type produces an error rather than a panic, so a
// schema or driver mismatch surfaces as an ordinary scan error to the caller.
func (l *CommaSeparatedList) Scan(src interface{}) error {
	*l = CommaSeparatedList{}

	var raw string
	switch v := src.(type) {
	case string:
		raw = v
	case []byte:
		raw = string(v)
	case nil:
		// NULL column: leave the list empty
		return nil
	default:
		return fmt.Errorf("cannot scan %T into CommaSeparatedList: should be a string", src)
	}

	for _, item := range strings.Split(raw, ",") {
		// Splitting "" (or "a,,b") produces empty items; they are not list members
		if item != "" {
			*l = append(*l, item)
		}
	}
	return nil
}

// Value implements the database/sql/driver Valuer interface by joining the items with commas.
func (l CommaSeparatedList) Value() (driver.Value, error) {
	return strings.Join(l, ","), nil
}
|
|
|
|
type Tweet struct {
|
|
ID TweetID `db:"id"`
|
|
UserID UserID `db:"user_id"`
|
|
User *User
|
|
Text string `db:"text"`
|
|
IsExpandable bool `db:"is_expandable"`
|
|
PostedAt Timestamp `db:"posted_at"`
|
|
NumLikes int `db:"num_likes"`
|
|
NumRetweets int `db:"num_retweets"`
|
|
NumReplies int `db:"num_replies"`
|
|
NumQuoteTweets int `db:"num_quote_tweets"`
|
|
InReplyToID TweetID `db:"in_reply_to_id"`
|
|
QuotedTweetID TweetID `db:"quoted_tweet_id"`
|
|
|
|
// For processing tombstones
|
|
UserHandle UserHandle
|
|
in_reply_to_user_handle UserHandle
|
|
in_reply_to_user_id UserID
|
|
|
|
Images []Image
|
|
Videos []Video
|
|
Urls []Url
|
|
Polls []Poll
|
|
Mentions CommaSeparatedList `db:"mentions"`
|
|
ReplyMentions CommaSeparatedList `db:"reply_mentions"`
|
|
Hashtags CommaSeparatedList `db:"hashtags"`
|
|
|
|
// TODO get-rid-of-redundant-spaces: Might be good to get rid of `Spaces`. Only used in APIv1 I think.
|
|
// A first-step would be to delete the Spaces after pulling them out of a Tweet into the Trove
|
|
// in ToTweetTrove. Then they will only be getting saved once rather than twice.
|
|
Spaces []Space
|
|
SpaceID SpaceID `db:"space_id"`
|
|
|
|
TombstoneType string `db:"tombstone_type"`
|
|
IsStub bool `db:"is_stub"`
|
|
|
|
IsContentDownloaded bool `db:"is_content_downloaded"`
|
|
IsConversationScraped bool `db:"is_conversation_scraped"`
|
|
LastScrapedAt Timestamp `db:"last_scraped_at"`
|
|
}
|
|
|
|
func (t Tweet) String() string {
|
|
var author string
|
|
if t.User != nil {
|
|
author = fmt.Sprintf("%s\n@%s", t.User.DisplayName, t.User.Handle)
|
|
} else {
|
|
author = "@???"
|
|
}
|
|
|
|
ret := fmt.Sprintf(
|
|
`%s
|
|
%s
|
|
%s
|
|
Replies: %d RT: %d QT: %d Likes: %d
|
|
`,
|
|
author,
|
|
terminal_utils.FormatDate(t.PostedAt.Time),
|
|
terminal_utils.WrapText(t.Text, 60),
|
|
t.NumReplies,
|
|
t.NumRetweets,
|
|
t.NumQuoteTweets,
|
|
t.NumLikes,
|
|
)
|
|
|
|
if len(t.Images) > 0 {
|
|
ret += fmt.Sprintf(terminal_utils.COLOR_GREEN+"images: %d\n"+terminal_utils.COLOR_RESET, len(t.Images))
|
|
}
|
|
if len(t.Urls) > 0 {
|
|
ret += "urls: [\n"
|
|
for _, url := range t.Urls {
|
|
ret += " " + url.Text + "\n"
|
|
}
|
|
ret += "]"
|
|
}
|
|
|
|
return ret
|
|
}
|
|
|
|
// Turn an APITweet, as returned from the scraper, into a properly structured Tweet object
|
|
func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
|
apiTweet.NormalizeContent()
|
|
|
|
ret.ID = TweetID(apiTweet.ID)
|
|
ret.UserID = UserID(apiTweet.UserID)
|
|
ret.UserHandle = UserHandle(apiTweet.UserHandle)
|
|
ret.Text = apiTweet.FullText
|
|
ret.IsExpandable = apiTweet.IsExpandable
|
|
|
|
// Process "posted-at" date and time
|
|
if apiTweet.TombstoneText == "" { // Skip time parsing for tombstones
|
|
ret.PostedAt, err = TimestampFromString(apiTweet.CreatedAt)
|
|
if err != nil {
|
|
if ret.ID == 0 {
|
|
return Tweet{}, fmt.Errorf("unable to parse tweet:\n %w", ERR_NO_TWEET)
|
|
}
|
|
return Tweet{}, fmt.Errorf("Error parsing time on tweet ID %d:\n %w", ret.ID, err)
|
|
}
|
|
}
|
|
|
|
ret.NumLikes = apiTweet.FavoriteCount
|
|
ret.NumRetweets = apiTweet.RetweetCount
|
|
ret.NumReplies = apiTweet.ReplyCount
|
|
ret.NumQuoteTweets = apiTweet.QuoteCount
|
|
ret.InReplyToID = TweetID(apiTweet.InReplyToStatusID)
|
|
ret.QuotedTweetID = TweetID(apiTweet.QuotedStatusID)
|
|
|
|
// Process URLs and link previews
|
|
for _, url := range apiTweet.Entities.URLs {
|
|
var url_object Url
|
|
if apiTweet.Card.ShortenedUrl == url.ShortenedUrl {
|
|
if apiTweet.Card.Name == "3691233323:audiospace" {
|
|
// This "url" is just a link to a Space. Don't process it as a Url
|
|
continue
|
|
}
|
|
url_object = ParseAPIUrlCard(apiTweet.Card)
|
|
}
|
|
url_object.Text = url.ExpandedURL
|
|
url_object.ShortText = url.ShortenedUrl
|
|
url_object.TweetID = ret.ID
|
|
|
|
// Skip it if it's just the quoted tweet
|
|
_, id, is_ok := TryParseTweetUrl(url.ExpandedURL)
|
|
if is_ok && id == ret.QuotedTweetID {
|
|
continue
|
|
}
|
|
|
|
ret.Urls = append(ret.Urls, url_object)
|
|
}
|
|
|
|
// Process images
|
|
for _, media := range apiTweet.Entities.Media {
|
|
if media.Type != "photo" { // TODO: remove this eventually
|
|
panic(fmt.Errorf("Unknown media type %q:\n %w", media.Type, EXTERNAL_API_ERROR))
|
|
}
|
|
new_image := ParseAPIMedia(media)
|
|
new_image.TweetID = ret.ID
|
|
ret.Images = append(ret.Images, new_image)
|
|
}
|
|
|
|
// Process hashtags
|
|
for _, hashtag := range apiTweet.Entities.Hashtags {
|
|
ret.Hashtags = append(ret.Hashtags, hashtag.Text)
|
|
}
|
|
|
|
// Process `@` mentions and reply-mentions
|
|
for _, mention := range apiTweet.Entities.Mentions {
|
|
ret.Mentions = append(ret.Mentions, mention.UserName)
|
|
}
|
|
for _, mention := range strings.Split(apiTweet.Entities.ReplyMentions, " ") {
|
|
if mention != "" {
|
|
if mention[0] != '@' {
|
|
panic(fmt.Errorf("Unknown ReplyMention value %q:\n %w", apiTweet.Entities.ReplyMentions, EXTERNAL_API_ERROR))
|
|
}
|
|
ret.ReplyMentions = append(ret.ReplyMentions, mention[1:])
|
|
}
|
|
}
|
|
|
|
// Process videos
|
|
for _, entity := range apiTweet.ExtendedEntities.Media {
|
|
if entity.Type != "video" && entity.Type != "animated_gif" {
|
|
continue
|
|
}
|
|
|
|
new_video := ParseAPIVideo(entity, ret.ID) // This assigns TweetID
|
|
ret.Videos = append(ret.Videos, new_video)
|
|
|
|
// Remove the thumbnail from the Images list
|
|
updated_imgs := []Image{}
|
|
for _, img := range ret.Images {
|
|
if VideoID(img.ID) != new_video.ID {
|
|
updated_imgs = append(updated_imgs, img)
|
|
}
|
|
}
|
|
ret.Images = updated_imgs
|
|
}
|
|
|
|
// Process polls
|
|
if strings.Index(apiTweet.Card.Name, "poll") == 0 {
|
|
poll := ParseAPIPoll(apiTweet.Card)
|
|
poll.TweetID = ret.ID
|
|
ret.Polls = []Poll{poll}
|
|
}
|
|
|
|
// Process spaces
|
|
if apiTweet.Card.Name == "3691233323:audiospace" {
|
|
space := ParseAPISpace(apiTweet.Card)
|
|
ret.Spaces = []Space{space}
|
|
ret.SpaceID = space.ID
|
|
}
|
|
|
|
// Process tombstones and other metadata
|
|
ret.TombstoneType = apiTweet.TombstoneText
|
|
ret.IsStub = !(ret.TombstoneType == "")
|
|
ret.LastScrapedAt = TimestampFromUnix(0) // Caller will change this for the tweet that was actually scraped
|
|
ret.IsConversationScraped = false // Safe due to the "No Worsening" principle
|
|
|
|
// Extra data that can help piece together tombstoned tweet info
|
|
ret.in_reply_to_user_id = UserID(apiTweet.InReplyToUserID)
|
|
ret.in_reply_to_user_handle = UserHandle(apiTweet.InReplyToScreenName)
|
|
|
|
return
|
|
}
|
|
|
|
/**
|
|
* Get a single tweet with no replies from the API.
|
|
*
|
|
* args:
|
|
* - id: the ID of the tweet to get
|
|
*
|
|
* returns: the single Tweet
|
|
*/
|
|
func GetTweet(id TweetID) (Tweet, error) {
|
|
tweet_response, err := the_api.GetTweet(id, "")
|
|
if err != nil {
|
|
return Tweet{}, fmt.Errorf("Error in API call:\n %w", err)
|
|
}
|
|
|
|
single_tweet, ok := tweet_response.GlobalObjects.Tweets[fmt.Sprint(id)]
|
|
|
|
if !ok {
|
|
return Tweet{}, fmt.Errorf("Didn't get the tweet!")
|
|
}
|
|
|
|
return ParseSingleTweet(single_tweet)
|
|
}
|
|
|
|
/**
|
|
* Return a list of tweets, including the original and the rest of its thread,
|
|
* along with a list of associated users.
|
|
*
|
|
* Mark the main tweet as "is_conversation_downloaded = true", and update its "last_scraped_at"
|
|
* value.
|
|
*
|
|
* args:
|
|
* - id: the ID of the tweet to get
|
|
*
|
|
* returns: the tweet, list of its replies and context, and users associated with those replies
|
|
*/
|
|
func GetTweetFull(id TweetID, how_many int) (trove TweetTrove, err error) {
|
|
tweet_response, err := the_api.GetTweet(id, "")
|
|
if err != nil {
|
|
err = fmt.Errorf("Error getting tweet: %d\n %w", id, err)
|
|
return
|
|
}
|
|
if len(tweet_response.GlobalObjects.Tweets) < how_many &&
|
|
tweet_response.GetCursor() != "" {
|
|
err = the_api.GetMoreReplies(id, &tweet_response, how_many)
|
|
if err != nil {
|
|
err = fmt.Errorf("Error getting more tweet replies: %d\n %w", id, err)
|
|
return
|
|
}
|
|
}
|
|
|
|
// This has to be called BEFORE ToTweetTrove, because it modifies the TweetResponse (adds tombstone tweets to its tweets list)
|
|
tombstoned_users := tweet_response.HandleTombstones()
|
|
|
|
trove, err = tweet_response.ToTweetTrove()
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
trove.TombstoneUsers = tombstoned_users
|
|
|
|
// Quoted tombstones need their user_id filled out from the tombstoned_users list
|
|
log.Debug("Running tweet trove post-processing\n")
|
|
err = trove.PostProcess()
|
|
if err != nil {
|
|
err = fmt.Errorf("Error getting tweet (id %d):\n %w", id, err)
|
|
return
|
|
}
|
|
|
|
// Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at"
|
|
tweet, ok := trove.Tweets[id]
|
|
if !ok {
|
|
panic("Trove didn't contain its own tweet!")
|
|
}
|
|
tweet.LastScrapedAt = Timestamp{time.Now()}
|
|
tweet.IsConversationScraped = true
|
|
trove.Tweets[id] = tweet
|
|
|
|
return
|
|
}
|
|
|
|
func GetTweetFullAPIV2(id TweetID, how_many int) (trove TweetTrove, err error) {
|
|
resp, err := the_api.GetTweetDetail(id, "")
|
|
if err != nil {
|
|
err = fmt.Errorf("Error getting tweet detail: %d\n %w", id, err)
|
|
return
|
|
}
|
|
err = the_api.GetMoreTweetReplies(id, &resp, how_many)
|
|
if err != nil && !errors.Is(err, END_OF_FEED) {
|
|
err = fmt.Errorf("Error getting more replies in tweet detail: %d\n %w", id, err)
|
|
return
|
|
}
|
|
trove, err = resp.ToTweetTrove()
|
|
if err != nil {
|
|
return trove, err
|
|
}
|
|
|
|
// Quoted tombstones need their user_id filled out from the tombstoned_users list
|
|
log.Debug("Running tweet trove post-processing\n")
|
|
err = trove.PostProcess()
|
|
if err != nil {
|
|
err = fmt.Errorf("Error getting tweet (id %d):\n %w", id, err)
|
|
return
|
|
}
|
|
|
|
// Find the main tweet and update its "is_conversation_downloaded" and "last_scraped_at"
|
|
tweet, ok := trove.Tweets[id]
|
|
if !ok {
|
|
panic("Trove didn't contain its own tweet!")
|
|
}
|
|
tweet.LastScrapedAt = Timestamp{time.Now()}
|
|
tweet.IsConversationScraped = true
|
|
trove.Tweets[id] = tweet
|
|
|
|
return
|
|
}
|