424 lines
11 KiB
Go
424 lines
11 KiB
Go
package persistence
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"strings"
|
|
"time"
|
|
|
|
"gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
|
|
)
|
|
|
|
// SortOrder selects which column a feed is ordered by, and in which direction.
// The same column is also used as the pagination key.
type SortOrder int

const (
	// Most recent first (descending `chrono`).  This is the zero value.
	SORT_ORDER_NEWEST SortOrder = iota
	// Least recent first (ascending `chrono`).
	SORT_ORDER_OLDEST
	// Highest `num_likes` first.
	SORT_ORDER_MOST_LIKES
	// Highest `num_retweets` first.
	SORT_ORDER_MOST_RETWEETS
)

// OrderByClause returns the SQL `order by` clause implementing this sort order.
//
// Panics if the receiver is not one of the SORT_ORDER_* constants.
func (o SortOrder) OrderByClause() string {
	var clause string
	switch o {
	case SORT_ORDER_NEWEST:
		clause = "order by chrono desc"
	case SORT_ORDER_OLDEST:
		clause = "order by chrono asc"
	case SORT_ORDER_MOST_LIKES:
		clause = "order by num_likes desc"
	case SORT_ORDER_MOST_RETWEETS:
		clause = "order by num_retweets desc"
	default:
		panic(fmt.Sprintf("Invalid sort order: %d", o))
	}
	return clause
}

// PaginationWhereClause returns the SQL condition (with a single `?` placeholder) that
// restricts a query to rows coming strictly after the cursor value in this sort order.
//
// Panics if the receiver is not one of the SORT_ORDER_* constants.
func (o SortOrder) PaginationWhereClause() string {
	var condition string
	switch o {
	case SORT_ORDER_NEWEST:
		condition = "chrono < ?"
	case SORT_ORDER_OLDEST:
		condition = "chrono > ?"
	case SORT_ORDER_MOST_LIKES:
		condition = "num_likes < ?"
	case SORT_ORDER_MOST_RETWEETS:
		condition = "num_retweets < ?"
	default:
		panic(fmt.Sprintf("Invalid sort order: %d", o))
	}
	return condition
}
|
|
func (o SortOrder) NextCursorValue(r CursorResult) int {
|
|
switch o {
|
|
case SORT_ORDER_NEWEST:
|
|
return r.Chrono
|
|
case SORT_ORDER_OLDEST:
|
|
return r.Chrono
|
|
case SORT_ORDER_MOST_LIKES:
|
|
return r.NumLikes
|
|
case SORT_ORDER_MOST_RETWEETS:
|
|
return r.NumRetweets
|
|
default:
|
|
panic(fmt.Sprintf("Invalid sort order: %d", o))
|
|
}
|
|
}
|
|
|
|
// CursorPosition is the position of a cursor within a feed (i.e., whether scrolling
// up/down is possible).
type CursorPosition int

const (
	// Top of the feed.  No pagination condition applies yet, so the cursor's
	// `CursorValue` is not meaningful.  This is the zero value.
	CURSOR_START CursorPosition = iota

	// Somewhere inside the feed; the cursor's `CursorValue` indicates what should
	// be on the next page.
	CURSOR_MIDDLE

	// Bottom of the feed has been reached.  Subsequent pages will all be empty.
	CURSOR_END
)

// IsEnd reports whether the bottom of the feed has been reached.
func (c CursorPosition) IsEnd() bool {
	return c == CURSOR_END
}
|
|
|
|
// Filter is a tri-state switch for one type of content: require it, exclude it,
// or be indifferent to it.
type Filter int

const (
	// The filter is not used (results may or may not match).  This is the zero value.
	NONE Filter = iota
	// All results must match the filter.
	REQUIRE
	// No result may match the filter.
	EXCLUDE
)
|
|
|
|
type CursorResult struct {
|
|
scraper.Tweet
|
|
scraper.Retweet
|
|
Chrono int `db:"chrono"`
|
|
ByUserID scraper.UserID `db:"by_user_id"`
|
|
}
|
|
|
|
type Cursor struct {
|
|
CursorPosition
|
|
CursorValue int
|
|
SortOrder
|
|
PageSize int
|
|
|
|
// Search params
|
|
Keywords []string
|
|
FromUserHandle scraper.UserHandle
|
|
RetweetedByUserHandle scraper.UserHandle
|
|
ByUserHandle scraper.UserHandle
|
|
ToUserHandles []scraper.UserHandle
|
|
SinceTimestamp scraper.Timestamp
|
|
UntilTimestamp scraper.Timestamp
|
|
FilterLinks Filter
|
|
FilterImages Filter
|
|
FilterVideos Filter
|
|
FilterPolls Filter
|
|
FilterSpaces Filter
|
|
FilterReplies Filter
|
|
FilterRetweets Filter
|
|
FilterOfflineFollowed Filter
|
|
}
|
|
|
|
// Generate a cursor with some reasonable defaults
|
|
func NewCursor() Cursor {
|
|
return Cursor{
|
|
Keywords: []string{},
|
|
ToUserHandles: []scraper.UserHandle{},
|
|
SinceTimestamp: scraper.TimestampFromUnix(0),
|
|
UntilTimestamp: scraper.TimestampFromUnix(0),
|
|
CursorPosition: CURSOR_START,
|
|
CursorValue: 0,
|
|
SortOrder: SORT_ORDER_NEWEST,
|
|
PageSize: 50,
|
|
|
|
FilterRetweets: EXCLUDE,
|
|
}
|
|
}
|
|
|
|
// Generate a cursor appropriate for fetching the Offline Timeline
|
|
func NewTimelineCursor() Cursor {
|
|
return Cursor{
|
|
Keywords: []string{},
|
|
ToUserHandles: []scraper.UserHandle{},
|
|
SinceTimestamp: scraper.TimestampFromUnix(0),
|
|
UntilTimestamp: scraper.TimestampFromUnix(0),
|
|
CursorPosition: CURSOR_START,
|
|
CursorValue: 0,
|
|
SortOrder: SORT_ORDER_NEWEST,
|
|
PageSize: 50,
|
|
|
|
FilterOfflineFollowed: REQUIRE,
|
|
}
|
|
}
|
|
|
|
// Generate a cursor appropriate for fetching a User Feed
|
|
func NewUserFeedCursor(h scraper.UserHandle) Cursor {
|
|
return Cursor{
|
|
Keywords: []string{},
|
|
ToUserHandles: []scraper.UserHandle{},
|
|
SinceTimestamp: scraper.TimestampFromUnix(0),
|
|
UntilTimestamp: scraper.TimestampFromUnix(0),
|
|
CursorPosition: CURSOR_START,
|
|
CursorValue: 0,
|
|
SortOrder: SORT_ORDER_NEWEST,
|
|
PageSize: 50,
|
|
|
|
ByUserHandle: h,
|
|
}
|
|
}
|
|
|
|
func NewCursorFromSearchQuery(q string) (Cursor, error) {
|
|
ret := NewCursor()
|
|
is_in_quotes := false
|
|
current_token := ""
|
|
|
|
for _, char := range q {
|
|
if char == ' ' && !is_in_quotes {
|
|
// Token is finished
|
|
if current_token == "" {
|
|
// Ignore empty tokens
|
|
continue
|
|
}
|
|
// Add the completed token
|
|
if err := ret.apply_token(current_token); err != nil {
|
|
return Cursor{}, err
|
|
}
|
|
current_token = ""
|
|
continue
|
|
}
|
|
|
|
if char == '"' {
|
|
if is_in_quotes {
|
|
is_in_quotes = false
|
|
if err := ret.apply_token(current_token); err != nil {
|
|
return Cursor{}, err
|
|
}
|
|
current_token = ""
|
|
continue
|
|
} else {
|
|
is_in_quotes = true
|
|
continue
|
|
}
|
|
}
|
|
|
|
// current_token = fmt.Sprintf("%s%s", current_token, char)
|
|
current_token += string(char)
|
|
}
|
|
|
|
// End of query string is reached
|
|
if is_in_quotes {
|
|
return Cursor{}, ErrUnmatchedQuotes
|
|
}
|
|
if current_token != "" {
|
|
if err := ret.apply_token(current_token); err != nil {
|
|
return Cursor{}, err
|
|
}
|
|
}
|
|
return ret, nil
|
|
}
|
|
|
|
var (
	// ErrInvalidQuery is the base error for every search query that cannot be parsed.
	// More specific query errors wrap it, so callers can test with `errors.Is`.
	ErrInvalidQuery = errors.New("invalid search query")

	// ErrUnmatchedQuotes indicates a query with an opening quote that is never closed.
	// It wraps ErrInvalidQuery.
	ErrUnmatchedQuotes = fmt.Errorf("%w (unmatched quotes)", ErrInvalidQuery)
)
|
|
|
|
func (c *Cursor) apply_token(token string) error {
|
|
parts := strings.Split(token, ":")
|
|
if len(parts) < 2 {
|
|
c.Keywords = append(c.Keywords, token)
|
|
return nil
|
|
}
|
|
var err error
|
|
switch parts[0] {
|
|
case "from":
|
|
c.FromUserHandle = scraper.UserHandle(parts[1])
|
|
case "to":
|
|
c.ToUserHandles = append(c.ToUserHandles, scraper.UserHandle(parts[1]))
|
|
case "retweeted_by":
|
|
c.RetweetedByUserHandle = scraper.UserHandle(parts[1])
|
|
case "since":
|
|
c.SinceTimestamp.Time, err = time.Parse("2006-01-02", parts[1])
|
|
case "until":
|
|
c.UntilTimestamp.Time, err = time.Parse("2006-01-02", parts[1])
|
|
case "filter":
|
|
switch parts[1] {
|
|
case "links":
|
|
c.FilterLinks = REQUIRE
|
|
case "images":
|
|
c.FilterImages = REQUIRE
|
|
case "videos":
|
|
c.FilterVideos = REQUIRE
|
|
case "polls":
|
|
c.FilterPolls = REQUIRE
|
|
case "spaces":
|
|
c.FilterSpaces = REQUIRE
|
|
}
|
|
}
|
|
if err != nil {
|
|
return fmt.Errorf("query token %q: %w", token, ErrInvalidQuery)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (p Profile) NextPage(c Cursor) (Feed, error) {
|
|
where_clauses := []string{}
|
|
bind_values := []interface{}{}
|
|
|
|
// Keywords
|
|
for _, kw := range c.Keywords {
|
|
where_clauses = append(where_clauses, "text like ?")
|
|
bind_values = append(bind_values, fmt.Sprintf("%%%s%%", kw))
|
|
}
|
|
|
|
// From, to, by, and RT'd by user handles
|
|
if c.FromUserHandle != "" {
|
|
where_clauses = append(where_clauses, "user_id = (select id from users where handle like ?)")
|
|
bind_values = append(bind_values, c.FromUserHandle)
|
|
}
|
|
for _, to_user := range c.ToUserHandles {
|
|
where_clauses = append(where_clauses, "reply_mentions like ?")
|
|
bind_values = append(bind_values, fmt.Sprintf("%%%s%%", to_user))
|
|
}
|
|
if c.RetweetedByUserHandle != "" {
|
|
where_clauses = append(where_clauses, "retweeted_by = (select id from users where handle like ?)")
|
|
bind_values = append(bind_values, c.RetweetedByUserHandle)
|
|
}
|
|
if c.ByUserHandle != "" {
|
|
where_clauses = append(where_clauses, "by_user_id = (select id from users where handle like ?)")
|
|
bind_values = append(bind_values, c.ByUserHandle)
|
|
}
|
|
|
|
// Since and until timestamps
|
|
if c.SinceTimestamp.Unix() != 0 {
|
|
where_clauses = append(where_clauses, "posted_at > ?")
|
|
bind_values = append(bind_values, c.SinceTimestamp)
|
|
}
|
|
if c.UntilTimestamp.Unix() != 0 {
|
|
where_clauses = append(where_clauses, "posted_at < ?")
|
|
bind_values = append(bind_values, c.UntilTimestamp)
|
|
}
|
|
|
|
// Media filters
|
|
switch c.FilterLinks {
|
|
case REQUIRE:
|
|
where_clauses = append(where_clauses, "exists (select 1 from urls where urls.tweet_id = tweets.id)")
|
|
case EXCLUDE:
|
|
where_clauses = append(where_clauses, "not exists (select 1 from urls where urls.tweet_id = tweets.id)")
|
|
}
|
|
switch c.FilterImages {
|
|
case REQUIRE:
|
|
where_clauses = append(where_clauses, "exists (select 1 from images where images.tweet_id = tweets.id)")
|
|
case EXCLUDE:
|
|
where_clauses = append(where_clauses, "not exists (select 1 from images where images.tweet_id = tweets.id)")
|
|
}
|
|
switch c.FilterVideos {
|
|
case REQUIRE:
|
|
where_clauses = append(where_clauses, "exists (select 1 from videos where videos.tweet_id = tweets.id)")
|
|
case EXCLUDE:
|
|
where_clauses = append(where_clauses, "not exists (select 1 from videos where videos.tweet_id = tweets.id)")
|
|
}
|
|
switch c.FilterPolls {
|
|
case REQUIRE:
|
|
where_clauses = append(where_clauses, "exists (select 1 from polls where polls.tweet_id = tweets.id)")
|
|
case EXCLUDE:
|
|
where_clauses = append(where_clauses, "not exists (select 1 from polls where polls.tweet_id = tweets.id)")
|
|
}
|
|
switch c.FilterSpaces {
|
|
case REQUIRE:
|
|
where_clauses = append(where_clauses, "space_id != 0")
|
|
case EXCLUDE:
|
|
where_clauses = append(where_clauses, "space_id = 0")
|
|
}
|
|
|
|
// Filter by lists (e.g., offline-followed)
|
|
switch c.FilterOfflineFollowed {
|
|
case REQUIRE:
|
|
where_clauses = append(where_clauses, "by_user_id in (select id from users where is_followed = 1)")
|
|
case EXCLUDE:
|
|
where_clauses = append(where_clauses, "by_user_id not in (select id from users where is_followed = 1)")
|
|
}
|
|
switch c.FilterReplies {
|
|
case REQUIRE:
|
|
where_clauses = append(where_clauses, "in_reply_to_id != 0")
|
|
case EXCLUDE:
|
|
where_clauses = append(where_clauses, "in_reply_to_id = 0")
|
|
}
|
|
switch c.FilterRetweets {
|
|
case REQUIRE:
|
|
where_clauses = append(where_clauses, "retweet_id != 0")
|
|
case EXCLUDE:
|
|
where_clauses = append(where_clauses, "retweet_id = 0")
|
|
}
|
|
|
|
// Pagination
|
|
if c.CursorPosition != CURSOR_START {
|
|
where_clauses = append(where_clauses, c.SortOrder.PaginationWhereClause())
|
|
bind_values = append(bind_values, c.CursorValue)
|
|
}
|
|
|
|
where_clause := "where " + strings.Join(where_clauses, " and ")
|
|
|
|
q := `select * from (
|
|
select ` + TWEETS_ALL_SQL_FIELDS + `,
|
|
0 tweet_id, 0 retweet_id, 0 retweeted_by, 0 retweeted_at,
|
|
posted_at chrono, user_id by_user_id
|
|
from tweets
|
|
left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
|
|
` + where_clause + ` ` + c.SortOrder.OrderByClause() + ` limit ?
|
|
)
|
|
|
|
union
|
|
|
|
select * from (
|
|
select ` + TWEETS_ALL_SQL_FIELDS + `,
|
|
tweet_id, retweet_id, retweeted_by, retweeted_at,
|
|
retweeted_at chrono, retweeted_by by_user_id
|
|
from retweets
|
|
left join tweets on retweets.tweet_id = tweets.id
|
|
left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
|
|
` + where_clause + `
|
|
` + c.SortOrder.OrderByClause() + `
|
|
limit ?
|
|
) ` + c.SortOrder.OrderByClause() + ` limit ?`
|
|
|
|
bind_values = append(bind_values, c.PageSize)
|
|
bind_values = append(bind_values, bind_values...)
|
|
bind_values = append(bind_values, c.PageSize)
|
|
|
|
// fmt.Printf("Query: %s\n", q)
|
|
// fmt.Printf("Bind values: %#v\n", bind_values)
|
|
// Run the query
|
|
var results []CursorResult
|
|
err := p.DB.Select(&results, q, bind_values...)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
// Assemble the feed
|
|
ret := NewFeed()
|
|
for _, val := range results {
|
|
// fmt.Printf("\tResult: %#v\n", val)
|
|
ret.Tweets[val.Tweet.ID] = val.Tweet
|
|
if val.Retweet.RetweetID != 0 {
|
|
ret.Retweets[val.Retweet.RetweetID] = val.Retweet
|
|
}
|
|
ret.Items = append(ret.Items, FeedItem{TweetID: val.Tweet.ID, RetweetID: val.Retweet.RetweetID})
|
|
}
|
|
|
|
p.fill_content(&ret.TweetTrove)
|
|
|
|
ret.CursorBottom = c
|
|
|
|
// Set the new cursor position and value
|
|
if len(results) < c.PageSize {
|
|
ret.CursorBottom.CursorPosition = CURSOR_END
|
|
} else {
|
|
ret.CursorBottom.CursorPosition = CURSOR_MIDDLE
|
|
last_item := results[len(results)-1]
|
|
ret.CursorBottom.CursorValue = c.SortOrder.NextCursorValue(last_item)
|
|
}
|
|
|
|
return ret, nil
|
|
}
|