offline-twitter/pkg/persistence/compound_ssf_queries.go
2024-05-28 21:55:42 -07:00

643 lines
20 KiB
Go

package persistence
import (
"errors"
"fmt"
"strconv"
"strings"
"time"
"gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
)
type SortOrder int
const (
SORT_ORDER_NEWEST SortOrder = iota
SORT_ORDER_OLDEST
SORT_ORDER_MOST_LIKES
SORT_ORDER_MOST_RETWEETS
SORT_ORDER_LIKED_AT
SORT_ORDER_BOOKMARKED_AT
)
func (o SortOrder) String() string {
return []string{"newest", "oldest", "most likes", "most retweets", "liked at", "bookmarked at"}[o]
}
func SortOrderFromString(s string) (SortOrder, bool) {
result, is_ok := map[string]SortOrder{
"newest": SORT_ORDER_NEWEST,
"oldest": SORT_ORDER_OLDEST,
"most likes": SORT_ORDER_MOST_LIKES,
"most retweets": SORT_ORDER_MOST_RETWEETS,
"liked at": SORT_ORDER_LIKED_AT,
"bookmarked at": SORT_ORDER_BOOKMARKED_AT,
}[s]
return result, is_ok // Have to store as temporary variable b/c otherwise it interprets it as single-value and compile fails
}
func (o SortOrder) OrderByClause() string {
switch o {
case SORT_ORDER_NEWEST:
return "order by chrono desc"
case SORT_ORDER_OLDEST:
return "order by chrono asc"
case SORT_ORDER_MOST_LIKES:
return "order by num_likes desc"
case SORT_ORDER_MOST_RETWEETS:
return "order by num_retweets desc"
case SORT_ORDER_LIKED_AT:
return "order by likes_sort_order desc"
case SORT_ORDER_BOOKMARKED_AT:
return "order by bookmarks_sort_order desc"
default:
panic(fmt.Sprintf("Invalid sort order: %d", o))
}
}
func (o SortOrder) PaginationWhereClause() string {
switch o {
case SORT_ORDER_NEWEST:
return "chrono < ?"
case SORT_ORDER_OLDEST:
return "chrono > ?"
case SORT_ORDER_MOST_LIKES:
return "num_likes < ?"
case SORT_ORDER_MOST_RETWEETS:
return "num_retweets < ?"
case SORT_ORDER_LIKED_AT:
return "likes_sort_order < ?"
case SORT_ORDER_BOOKMARKED_AT:
return "bookmarks_sort_order < ?"
default:
panic(fmt.Sprintf("Invalid sort order: %d", o))
}
}
func (o SortOrder) NextCursorValue(r CursorResult) int {
switch o {
case SORT_ORDER_NEWEST:
return r.Chrono
case SORT_ORDER_OLDEST:
return r.Chrono
case SORT_ORDER_MOST_LIKES:
return r.NumLikes
case SORT_ORDER_MOST_RETWEETS:
return r.NumRetweets
case SORT_ORDER_LIKED_AT:
return r.LikeSortOrder
case SORT_ORDER_BOOKMARKED_AT:
return r.BookmarkSortOrder
default:
panic(fmt.Sprintf("Invalid sort order: %d", o))
}
}
func (o SortOrder) NextDMCursorValue(m scraper.DMMessage) int64 {
switch o {
case SORT_ORDER_NEWEST, SORT_ORDER_OLDEST:
return m.SentAt.UnixMilli()
default:
panic(fmt.Sprintf("Invalid sort order for DMs: %d", o))
}
}
// Position in the feed (i.e., whether scrolling up/down is possible)
type CursorPosition int
const (
// This is the top of the feed; `cursor_position` is invalid;
CURSOR_START CursorPosition = iota
// `cursor_position` indicates what should be on the next page;
CURSOR_MIDDLE
// Bottom of the feed has been reached. Subsequent pages will all be empty
CURSOR_END
)
func (c CursorPosition) IsEnd() bool {
return c == CURSOR_END
}
// Whether to require, exclude, or indifferent a type of content
type Filter int
const (
// Filter is not used
NONE Filter = iota
// All results must match the filter
REQUIRE
// Results must not match the filter
EXCLUDE
)
type CursorResult struct {
scraper.Tweet
scraper.Retweet
Chrono int `db:"chrono"`
LikeSortOrder int `db:"likes_sort_order"`
BookmarkSortOrder int `db:"bookmarks_sort_order"`
ByUserID scraper.UserID `db:"by_user_id"`
}
type Cursor struct {
CursorPosition
CursorValue int
SortOrder
PageSize int
// Search params
Keywords []string
FromUserHandle scraper.UserHandle // Tweeted by this user
RetweetedByUserHandle scraper.UserHandle // Retweeted by this user
ByUserHandle scraper.UserHandle // Either tweeted or retweeted by this user
ToUserHandles []scraper.UserHandle // In reply to these users
LikedByUserHandle scraper.UserHandle // Liked by this user
BookmarkedByUserHandle scraper.UserHandle // Bookmarked by this user
ListID scraper.ListID // Either tweeted or retweeted by users from this List
FollowedByUserHandle scraper.UserHandle // Either tweeted or retweeted by users followed by this user
SinceTimestamp scraper.Timestamp
UntilTimestamp scraper.Timestamp
TombstoneType string
FilterLinks Filter
FilterImages Filter
FilterVideos Filter
FilterMedia Filter
FilterPolls Filter
FilterSpaces Filter
FilterReplies Filter
FilterRetweets Filter
FilterOfflineFollowed Filter
}
// Generate a cursor with some reasonable defaults
func NewCursor() Cursor {
return Cursor{
Keywords: []string{},
ToUserHandles: []scraper.UserHandle{},
SinceTimestamp: scraper.TimestampFromUnix(0),
UntilTimestamp: scraper.TimestampFromUnix(0),
CursorPosition: CURSOR_START,
CursorValue: 0,
SortOrder: SORT_ORDER_NEWEST,
PageSize: 50,
FilterRetweets: EXCLUDE,
}
}
// Generate a cursor appropriate for fetching the Offline Timeline
func NewTimelineCursor() Cursor {
return Cursor{
Keywords: []string{},
ToUserHandles: []scraper.UserHandle{},
SinceTimestamp: scraper.TimestampFromUnix(0),
UntilTimestamp: scraper.TimestampFromUnix(0),
CursorPosition: CURSOR_START,
CursorValue: 0,
SortOrder: SORT_ORDER_NEWEST,
PageSize: 50,
FilterOfflineFollowed: REQUIRE,
}
}
// Generate a cursor appropriate for showing a List feed
func NewListCursor(list_id scraper.ListID) Cursor {
return Cursor{
Keywords: []string{},
ToUserHandles: []scraper.UserHandle{},
ListID: list_id,
SinceTimestamp: scraper.TimestampFromUnix(0),
UntilTimestamp: scraper.TimestampFromUnix(0),
CursorPosition: CURSOR_START,
CursorValue: 0,
SortOrder: SORT_ORDER_NEWEST,
PageSize: 50,
}
}
// Generate a cursor appropriate for fetching a User Feed
func NewUserFeedCursor(h scraper.UserHandle) Cursor {
return Cursor{
Keywords: []string{},
ToUserHandles: []scraper.UserHandle{},
SinceTimestamp: scraper.TimestampFromUnix(0),
UntilTimestamp: scraper.TimestampFromUnix(0),
CursorPosition: CURSOR_START,
CursorValue: 0,
SortOrder: SORT_ORDER_NEWEST,
PageSize: 50,
ByUserHandle: h,
}
}
// Generate a cursor appropriate for a user's Media tab
func NewUserFeedMediaCursor(h scraper.UserHandle) Cursor {
return Cursor{
Keywords: []string{},
ToUserHandles: []scraper.UserHandle{},
SinceTimestamp: scraper.TimestampFromUnix(0),
UntilTimestamp: scraper.TimestampFromUnix(0),
CursorPosition: CURSOR_START,
CursorValue: 0,
SortOrder: SORT_ORDER_NEWEST,
PageSize: 50,
ByUserHandle: h,
FilterMedia: REQUIRE,
}
}
// Generate a cursor for a User's Likes
func NewUserFeedLikesCursor(h scraper.UserHandle) Cursor {
return Cursor{
Keywords: []string{},
ToUserHandles: []scraper.UserHandle{},
SinceTimestamp: scraper.TimestampFromUnix(0),
UntilTimestamp: scraper.TimestampFromUnix(0),
CursorPosition: CURSOR_START,
CursorValue: 0,
SortOrder: SORT_ORDER_LIKED_AT,
PageSize: 50,
LikedByUserHandle: h,
}
}
// Generate a cursor for a User's Bookmarks
func NewUserFeedBookmarksCursor(h scraper.UserHandle) Cursor {
return Cursor{
Keywords: []string{},
ToUserHandles: []scraper.UserHandle{},
SinceTimestamp: scraper.TimestampFromUnix(0),
UntilTimestamp: scraper.TimestampFromUnix(0),
CursorPosition: CURSOR_START,
CursorValue: 0,
SortOrder: SORT_ORDER_BOOKMARKED_AT,
PageSize: 50,
BookmarkedByUserHandle: h,
}
}
func NewCursorFromSearchQuery(q string) (Cursor, error) {
ret := NewCursor()
is_in_quotes := false
current_token := ""
for _, char := range q {
if char == ' ' && !is_in_quotes {
// Token is finished
if current_token == "" {
// Ignore empty tokens
continue
}
// Add the completed token
if err := ret.apply_token(current_token); err != nil {
return Cursor{}, err
}
current_token = ""
continue
}
if char == '"' {
if is_in_quotes {
is_in_quotes = false
if err := ret.apply_token(current_token); err != nil {
return Cursor{}, err
}
current_token = ""
continue
} else {
is_in_quotes = true
continue
}
}
// current_token = fmt.Sprintf("%s%s", current_token, char)
current_token += string(char)
}
// End of query string is reached
if is_in_quotes {
return Cursor{}, ErrUnmatchedQuotes
}
if current_token != "" {
if err := ret.apply_token(current_token); err != nil {
return Cursor{}, err
}
}
return ret, nil
}
var ErrInvalidQuery = errors.New("invalid search query")
var ErrUnmatchedQuotes = fmt.Errorf("%w (unmatched quotes)", ErrInvalidQuery)
func (c *Cursor) apply_token(token string) error {
parts := strings.Split(token, ":")
if len(parts) < 2 {
c.Keywords = append(c.Keywords, token)
return nil
}
var err error
switch parts[0] {
case "from":
c.FromUserHandle = scraper.UserHandle(parts[1])
case "to":
c.ToUserHandles = append(c.ToUserHandles, scraper.UserHandle(parts[1]))
case "retweeted_by":
c.RetweetedByUserHandle = scraper.UserHandle(parts[1])
c.FilterRetweets = NONE // Clear the "exclude retweets" filter set by default in NewCursor
case "liked_by":
c.LikedByUserHandle = scraper.UserHandle(parts[1])
case "bookmarked_by":
c.BookmarkedByUserHandle = scraper.UserHandle(parts[1])
case "followed_by":
c.FollowedByUserHandle = scraper.UserHandle(parts[1])
case "list":
i, err := strconv.Atoi(parts[1])
if err != nil {
return fmt.Errorf("%w: filter 'list:' must be a number (list ID), got %q", ErrInvalidQuery, parts[1])
}
c.ListID = scraper.ListID(i)
case "since":
c.SinceTimestamp.Time, err = time.Parse("2006-01-02", parts[1])
case "until":
c.UntilTimestamp.Time, err = time.Parse("2006-01-02", parts[1])
case "tombstone":
c.TombstoneType = parts[1]
case "filter":
switch parts[1] {
case "links":
c.FilterLinks = REQUIRE
case "images":
c.FilterImages = REQUIRE
case "videos":
c.FilterVideos = REQUIRE
case "media":
c.FilterMedia = REQUIRE
case "polls":
c.FilterPolls = REQUIRE
case "spaces":
c.FilterSpaces = REQUIRE
case "replies":
c.FilterReplies = REQUIRE
case "retweets":
c.FilterRetweets = REQUIRE
}
case "-filter":
switch parts[1] {
case "links":
c.FilterLinks = EXCLUDE
case "images":
c.FilterImages = EXCLUDE
case "videos":
c.FilterVideos = EXCLUDE
case "media":
c.FilterMedia = EXCLUDE
case "polls":
c.FilterPolls = EXCLUDE
case "spaces":
c.FilterSpaces = EXCLUDE
case "replies":
c.FilterReplies = EXCLUDE
case "retweets":
c.FilterRetweets = EXCLUDE
}
}
if err != nil {
return fmt.Errorf("query token %q: %w", token, ErrInvalidQuery)
}
return nil
}
func (p Profile) NextPage(c Cursor, current_user_id scraper.UserID) (Feed, error) {
where_clauses := []string{}
bind_values := []interface{}{}
// Keywords
for _, kw := range c.Keywords {
where_clauses = append(where_clauses, "text like ?")
bind_values = append(bind_values, fmt.Sprintf("%%%s%%", kw))
}
// From, to, by, and RT'd by user handles
if c.FromUserHandle != "" {
where_clauses = append(where_clauses, "tweets.user_id = (select id from users where handle like ?)")
bind_values = append(bind_values, c.FromUserHandle)
}
for _, to_user := range c.ToUserHandles {
where_clauses = append(where_clauses, "reply_mentions like ?")
bind_values = append(bind_values, fmt.Sprintf("%%%s%%", to_user))
}
if c.RetweetedByUserHandle != "" {
where_clauses = append(where_clauses, "retweeted_by = (select id from users where handle like ?)")
bind_values = append(bind_values, c.RetweetedByUserHandle)
}
if c.ByUserHandle != "" {
where_clauses = append(where_clauses, "by_user_id = (select id from users where handle like ?)")
bind_values = append(bind_values, c.ByUserHandle)
}
if c.ListID != 0 {
where_clauses = append(where_clauses, "by_user_id in (select user_id from list_users where list_id = ?)")
bind_values = append(bind_values, c.ListID)
}
if c.FollowedByUserHandle != "" {
where_clauses = append(where_clauses,
"by_user_id in (select followee_id from follows where follower_id = (select id from users where handle like ?))")
bind_values = append(bind_values, c.FollowedByUserHandle)
}
// Since and until timestamps
if c.SinceTimestamp.Unix() != 0 {
where_clauses = append(where_clauses, "posted_at > ?")
bind_values = append(bind_values, c.SinceTimestamp)
}
if c.UntilTimestamp.Unix() != 0 {
where_clauses = append(where_clauses, "posted_at < ?")
bind_values = append(bind_values, c.UntilTimestamp)
}
// Tombstone filter
if c.TombstoneType == "true" {
where_clauses = append(where_clauses, "tombstone_type != 0")
} else if c.TombstoneType != "" {
where_clauses = append(where_clauses, "tombstone_type = (select rowid from tombstone_types where short_name like ?)")
bind_values = append(bind_values, c.TombstoneType)
}
// Media filters
switch c.FilterLinks {
case REQUIRE:
where_clauses = append(where_clauses, "exists (select 1 from urls where urls.tweet_id = tweets.id)")
case EXCLUDE:
where_clauses = append(where_clauses, "not exists (select 1 from urls where urls.tweet_id = tweets.id)")
}
switch c.FilterImages {
case REQUIRE:
where_clauses = append(where_clauses, "exists (select 1 from images where images.tweet_id = tweets.id)")
case EXCLUDE:
where_clauses = append(where_clauses, "not exists (select 1 from images where images.tweet_id = tweets.id)")
}
switch c.FilterVideos {
case REQUIRE:
where_clauses = append(where_clauses, "exists (select 1 from videos where videos.tweet_id = tweets.id)")
case EXCLUDE:
where_clauses = append(where_clauses, "not exists (select 1 from videos where videos.tweet_id = tweets.id)")
}
switch c.FilterMedia {
case REQUIRE:
where_clauses = append(where_clauses, `(exists (select 1 from videos where videos.tweet_id = tweets.id)
or exists (select 1 from images where images.tweet_id = tweets.id))`)
case EXCLUDE:
where_clauses = append(where_clauses, `not (exists (select 1 from videos where videos.tweet_id = tweets.id)
or exists (select 1 from images where images.tweet_id = tweets.id))`)
}
switch c.FilterPolls {
case REQUIRE:
where_clauses = append(where_clauses, "exists (select 1 from polls where polls.tweet_id = tweets.id)")
case EXCLUDE:
where_clauses = append(where_clauses, "not exists (select 1 from polls where polls.tweet_id = tweets.id)")
}
switch c.FilterSpaces {
case REQUIRE:
where_clauses = append(where_clauses, "space_id != 0")
case EXCLUDE:
where_clauses = append(where_clauses, "space_id = 0")
}
// Filter by lists (e.g., offline-followed)
switch c.FilterOfflineFollowed {
case REQUIRE:
where_clauses = append(where_clauses, "by_user_id in (select id from users where is_followed = 1)")
case EXCLUDE:
where_clauses = append(where_clauses, "by_user_id not in (select id from users where is_followed = 1)")
}
switch c.FilterReplies {
case REQUIRE:
where_clauses = append(where_clauses, "in_reply_to_id != 0")
case EXCLUDE:
where_clauses = append(where_clauses, "in_reply_to_id = 0")
}
switch c.FilterRetweets {
case REQUIRE:
where_clauses = append(where_clauses, "retweet_id != 0")
case EXCLUDE:
where_clauses = append(where_clauses, "retweet_id = 0")
}
liked_by_filter_join_clause := ""
likes_sort_order_field := ""
if c.LikedByUserHandle != "" {
liked_by_filter_join_clause = " join likes filter_likes on tweets.id = filter_likes.tweet_id "
where_clauses = append(where_clauses, "filter_likes.user_id = (select id from users where handle like ?) ")
bind_values = append(bind_values, c.LikedByUserHandle)
likes_sort_order_field = ", coalesce(filter_likes.sort_order, -1) likes_sort_order "
// Don't include retweets on "liked by" searches because it doesn't distinguish which retweet
// version was the "liked" one
where_clauses = append(where_clauses, "retweet_id = 0")
}
bookmarked_by_filter_join_clause := ""
bookmarks_sort_order_field := ""
if c.BookmarkedByUserHandle != "" {
bookmarked_by_filter_join_clause = " join bookmarks filter_bookmarks on tweets.id = filter_bookmarks.tweet_id "
where_clauses = append(where_clauses, "filter_bookmarks.user_id = (select id from users where handle like ?) ")
bind_values = append(bind_values, c.BookmarkedByUserHandle)
bookmarks_sort_order_field = ", coalesce(filter_bookmarks.sort_order, -1) bookmarks_sort_order "
// Don't include retweets on "bookmarked by" searches because it doesn't distinguish which retweet
// version was the "bookmarked" one
where_clauses = append(where_clauses, "retweet_id = 0")
}
// Pagination
if c.CursorPosition != CURSOR_START {
where_clauses = append(where_clauses, c.SortOrder.PaginationWhereClause())
bind_values = append(bind_values, c.CursorValue)
}
// Assemble the full where-clause
where_clause := ""
if len(where_clauses) > 0 {
where_clause = "where " + strings.Join(where_clauses, " and ")
}
// The Query:
// 1. Base query:
// a. Include "likes_sort_order" and "bookmarks_sort_order" fields, if they're in the filters
// b. Left join on "likes" table to get whether logged-in user has liked the tweet
// c. Left join on "likes" and "bookmarks" tables, if needed (i.e., if in the filters)
// d. Add 'where', 'order by', and (mildly unnecessary) 'limit' clauses
// 2. Two copies of the base query, one for "tweets" and one for "retweets", joined with "union"
// 3. Actual "limit" clause
q := `select * from (
select ` + TWEETS_ALL_SQL_FIELDS + likes_sort_order_field + bookmarks_sort_order_field + `,
0 tweet_id, 0 retweet_id, 0 retweeted_by, 0 retweeted_at,
posted_at chrono, tweets.user_id by_user_id
from tweets
left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
left join likes on tweets.id = likes.tweet_id and likes.user_id = ?
` + liked_by_filter_join_clause + `
` + bookmarked_by_filter_join_clause + `
` + where_clause + ` ` + c.SortOrder.OrderByClause() + ` limit ?
)
union
select * from (
select ` + TWEETS_ALL_SQL_FIELDS + likes_sort_order_field + bookmarks_sort_order_field + `,
retweets.tweet_id, retweet_id, retweeted_by, retweeted_at,
retweeted_at chrono, retweeted_by by_user_id
from retweets
left join tweets on retweets.tweet_id = tweets.id
left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
left join likes on tweets.id = likes.tweet_id and likes.user_id = ?
` + liked_by_filter_join_clause + `
` + bookmarked_by_filter_join_clause + `
` + where_clause + ` ` + c.SortOrder.OrderByClause() + ` limit ?
)` + c.SortOrder.OrderByClause() + ` limit ?`
bind_values = append([]interface{}{current_user_id}, bind_values...)
bind_values = append(bind_values, c.PageSize)
bind_values = append(bind_values, bind_values...)
bind_values = append(bind_values, c.PageSize)
// fmt.Printf("Query: %s\n", q)
// fmt.Printf("Bind values: %#v\n", bind_values)
// Run the query
var results []CursorResult
err := p.DB.Select(&results, q, bind_values...)
if err != nil {
panic(err)
}
// Assemble the feed
ret := NewFeed()
for _, val := range results {
// fmt.Printf("\tResult: %#v\n", val)
ret.Tweets[val.Tweet.ID] = val.Tweet
if val.Retweet.RetweetID != 0 {
ret.Retweets[val.Retweet.RetweetID] = val.Retweet
}
ret.Items = append(ret.Items, FeedItem{TweetID: val.Tweet.ID, RetweetID: val.Retweet.RetweetID})
}
p.fill_content(&ret.TweetTrove, current_user_id)
// Set the new cursor position and value
ret.CursorBottom = c // Copy cursor values over
if len(results) < c.PageSize {
ret.CursorBottom.CursorPosition = CURSOR_END
} else {
ret.CursorBottom.CursorPosition = CURSOR_MIDDLE
last_item := results[len(results)-1]
ret.CursorBottom.CursorValue = c.SortOrder.NextCursorValue(last_item)
}
return ret, nil
}