From be39d1ab0382a0af14a1ec7c67ed6ef533ddfe7e Mon Sep 17 00:00:00 2001
From: Alessio
Date: Fri, 11 Aug 2023 01:38:49 -0300
Subject: [PATCH] Add cursor queries

---
 pkg/persistence/compound_queries.go          |   1 +
 pkg/persistence/compound_ssf_queries.go      | 231 +++++++++++++++++++
 pkg/persistence/compound_ssf_queries_test.go |  89 +++++++
 3 files changed, 321 insertions(+)
 create mode 100644 pkg/persistence/compound_ssf_queries.go
 create mode 100644 pkg/persistence/compound_ssf_queries_test.go

diff --git a/pkg/persistence/compound_queries.go b/pkg/persistence/compound_queries.go
index 670ab15..bac7e7b 100644
--- a/pkg/persistence/compound_queries.go
+++ b/pkg/persistence/compound_queries.go
@@ -287,6 +287,7 @@ type FeedItem struct {
 type Feed struct {
 	Items []FeedItem
 	TweetTrove
+	CursorBottom Cursor
 }

 func (f Feed) BottomTimestamp() Timestamp {
diff --git a/pkg/persistence/compound_ssf_queries.go b/pkg/persistence/compound_ssf_queries.go
new file mode 100644
index 0000000..f0ed66f
--- /dev/null
+++ b/pkg/persistence/compound_ssf_queries.go

package persistence

import (
	"fmt"
	"strings"

	"gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
)

// SortOrder selects the column a cursored query sorts by, and the direction.
type SortOrder int

const (
	SORT_ORDER_NEWEST SortOrder = iota
	SORT_ORDER_OLDEST
	SORT_ORDER_MOST_LIKES
	SORT_ORDER_MOST_RETWEETS
)

// OrderByClause returns the SQL "order by" clause for this sort order.
// Panics on an out-of-range value (programmer error).
func (o SortOrder) OrderByClause() string {
	switch o {
	case SORT_ORDER_NEWEST:
		return "order by chrono desc"
	case SORT_ORDER_OLDEST:
		return "order by chrono asc"
	case SORT_ORDER_MOST_LIKES:
		return "order by num_likes desc"
	case SORT_ORDER_MOST_RETWEETS:
		return "order by num_retweets desc"
	default:
		panic(fmt.Sprintf("Invalid sort order: %d", o))
	}
}

// PaginationWhereClause returns the SQL condition selecting rows strictly past
// the current cursor value, in the same direction as OrderByClause.
//
// NOTE(review): for the num_likes / num_retweets orders, rows that tie with the
// last row of the previous page are skipped entirely -- confirm this is an
// acceptable trade-off (a compound cursor would be needed to avoid it).
func (o SortOrder) PaginationWhereClause() string {
	switch o {
	case SORT_ORDER_NEWEST:
		return "chrono < ?"
	case SORT_ORDER_OLDEST:
		return "chrono > ?"
	case SORT_ORDER_MOST_LIKES:
		return "num_likes < ?"
	case SORT_ORDER_MOST_RETWEETS:
		return "num_retweets < ?"
	default:
		panic(fmt.Sprintf("Invalid sort order: %d", o))
	}
}

// NextCursorValue extracts, from the last result on a page, the value the next
// page's PaginationWhereClause should compare against.
func (o SortOrder) NextCursorValue(r CursorResult) int {
	switch o {
	case SORT_ORDER_NEWEST, SORT_ORDER_OLDEST:
		// Both chronological orders key on the computed "chrono" column
		return r.Chrono
	case SORT_ORDER_MOST_LIKES:
		return r.NumLikes
	case SORT_ORDER_MOST_RETWEETS:
		return r.NumRetweets
	default:
		panic(fmt.Sprintf("Invalid sort order: %d", o))
	}
}

// Position in the feed (i.e., whether scrolling up/down is possible)
type CursorPosition int

const (
	// This is the top of the feed; `cursor_position` is invalid;
	CURSOR_START CursorPosition = iota

	// `cursor_position` indicates what should be on the next page;
	CURSOR_MIDDLE

	// Bottom of the feed has been reached.  Subsequent pages will all be empty
	CURSOR_END
)

// CursorResult is one row of the paginated query: a Tweet, the Retweet (zero-valued
// for plain tweets), and the "chrono" sort key computed by the query.
type CursorResult struct {
	scraper.Tweet
	scraper.Retweet
	Chrono int `db:"chrono"`
}

// Cursor is a paginated search: the filter parameters plus the position reached so far.
type Cursor struct {
	CursorPosition
	CursorValue int
	SortOrder
	PageSize int

	// Search params
	Keywords              []string
	FromUserHandle        scraper.UserHandle
	ToUserHandles         []scraper.UserHandle
	RetweetedByUserHandle scraper.UserHandle
	SinceTimestamp        scraper.Timestamp
	UntilTimestamp        scraper.Timestamp
	FilterLinks           bool
	FilterImages          bool
	FilterVideos          bool
	FilterPolls           bool
}

// NewCursor returns a cursor at the top of the feed: newest-first, no filters,
// default page size of 50.
func NewCursor() Cursor {
	return Cursor{
		Keywords:       []string{},
		ToUserHandles:  []scraper.UserHandle{},
		SinceTimestamp: scraper.TimestampFromUnix(0),
		UntilTimestamp: scraper.TimestampFromUnix(0),
		CursorPosition: CURSOR_START,
		CursorValue:    0,
		SortOrder:      SORT_ORDER_NEWEST,
		PageSize:       50,
	}
}

// NextPage runs the search described by `c` and returns one page of results.
// The returned Feed's CursorBottom is a cursor that fetches the following page;
// its position is CURSOR_END once a short (or empty) page is returned.
func (p Profile) NextPage(c Cursor) (Feed, error) {
	where_clauses := []string{}
	bind_values := []interface{}{}

	// Keywords
	for _, kw := range c.Keywords {
		where_clauses = append(where_clauses, "text like ?")
		bind_values = append(bind_values, fmt.Sprintf("%%%s%%", kw))
	}

	// From, to, and RT'd by user handles
	if c.FromUserHandle != "" {
		where_clauses = append(where_clauses, "user_id = (select id from users where handle like ?)")
		bind_values = append(bind_values, c.FromUserHandle)
	}
	for _, to_user := range c.ToUserHandles {
		where_clauses = append(where_clauses, "reply_mentions like ?")
		bind_values = append(bind_values, fmt.Sprintf("%%%s%%", to_user))
	}
	// The query below selects a literal `0 retweeted_by` for plain tweets, so when
	// the handle is empty the coalesce(..., 0) matches only non-retweets.
	where_clauses = append(where_clauses, "retweeted_by = coalesce((select id from users where handle like ?), 0)")
	bind_values = append(bind_values, c.RetweetedByUserHandle)

	// Since and until timestamps
	if c.SinceTimestamp.Unix() != 0 {
		where_clauses = append(where_clauses, "posted_at > ?")
		bind_values = append(bind_values, c.SinceTimestamp)
	}
	if c.UntilTimestamp.Unix() != 0 {
		where_clauses = append(where_clauses, "posted_at < ?")
		bind_values = append(bind_values, c.UntilTimestamp)
	}

	// Media filters
	if c.FilterLinks {
		where_clauses = append(where_clauses, "exists (select 1 from urls where urls.tweet_id = tweets.id)")
	}
	if c.FilterImages {
		where_clauses = append(where_clauses, "exists (select 1 from images where images.tweet_id = tweets.id)")
	}
	if c.FilterVideos {
		where_clauses = append(where_clauses, "exists (select 1 from videos where videos.tweet_id = tweets.id)")
	}
	if c.FilterPolls {
		where_clauses = append(where_clauses, "exists (select 1 from polls where polls.tweet_id = tweets.id)")
	}

	// Pagination: only constrain past the cursor value once we're off the first page
	if c.CursorPosition != CURSOR_START {
		where_clauses = append(where_clauses, c.SortOrder.PaginationWhereClause())
		bind_values = append(bind_values, c.CursorValue)
	}

	where_clause := "where " + strings.Join(where_clauses, " and ")

	// Tweets and retweets are unioned into one result set; plain tweets carry
	// zeroed retweet columns and use `posted_at` as the chrono sort key, while
	// retweets use `retweeted_at`.
	q := `select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, quoted_tweet_id,
	             mentions, reply_mentions, hashtags, ifnull(space_id, '') space_id, ifnull(tombstone_types.short_name, "") tombstone_type,
	             is_expandable,
	             is_stub, is_content_downloaded, is_conversation_scraped, last_scraped_at,
	             0 tweet_id, 0 retweet_id, 0 retweeted_by, 0 retweeted_at,
	             posted_at chrono
	        from tweets
	        left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
	       ` + where_clause + `

	      union

	      select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to_id, quoted_tweet_id,
	             mentions, reply_mentions, hashtags, ifnull(space_id, '') space_id, ifnull(tombstone_types.short_name, "") tombstone_type,
	             is_expandable,
	             is_stub, is_content_downloaded, is_conversation_scraped, last_scraped_at,
	             tweet_id, retweet_id, retweeted_by, retweeted_at,
	             retweeted_at chrono
	        from retweets
	        left join tweets on retweets.tweet_id = tweets.id
	        left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid
	       ` + where_clause + `
	     ` + c.SortOrder.OrderByClause() + `
	     limit ?`

	// The where-clause binds appear once per half of the union, then the limit
	bind_values = append(bind_values, bind_values...)
	bind_values = append(bind_values, c.PageSize)

	// Run the query.  Was: panic(err) plus debug Printfs of the query and args;
	// return the error to the caller instead, as the signature promises.
	var results []CursorResult
	err := p.DB.Select(&results, q, bind_values...)
	if err != nil {
		return Feed{}, fmt.Errorf("running cursor query: %w", err)
	}

	// Assemble the feed
	ret := NewFeed()
	for _, val := range results {
		ret.Tweets[val.Tweet.ID] = val.Tweet
		if val.Retweet.RetweetID != 0 {
			ret.Retweets[val.Retweet.RetweetID] = val.Retweet
		}
		ret.Items = append(ret.Items, FeedItem{TweetID: val.Tweet.ID, RetweetID: val.Retweet.RetweetID})
	}

	p.fill_content(&ret.TweetTrove)

	// Set the new cursor position
	ret.CursorBottom = c
	if len(results) > 0 {
		// Was an unconditional `results[len(results)-1]`, which panics with an
		// index-out-of-range on an empty page (the tests request a page past the
		// end of the results).  On an empty page the cursor value is left as-is.
		ret.CursorBottom.CursorValue = c.SortOrder.NextCursorValue(results[len(results)-1])
	}
	if len(results) < c.PageSize {
		// A short page means the feed is exhausted
		ret.CursorBottom.CursorPosition = CURSOR_END
	} else {
		ret.CursorBottom.CursorPosition = CURSOR_MIDDLE
	}

	return ret, nil
}
diff --git a/pkg/persistence/compound_ssf_queries_test.go b/pkg/persistence/compound_ssf_queries_test.go
new file mode 100644
index 0000000..7009980
--- /dev/null
+++ b/pkg/persistence/compound_ssf_queries_test.go

package persistence_test

import (
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"gitlab.com/offline-twitter/twitter_offline_engine/pkg/persistence"
	. "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
)

// Use a cursor, sort by newest
func TestCursorSearchByNewest(t *testing.T) {
	require := require.New(t)
	assert := assert.New(t)

	profile, err := persistence.LoadProfile("../../sample_data/profile")
	require.NoError(err)

	c := persistence.NewCursor()
	c.PageSize = 3
	c.Keywords = []string{"think"}
	c.SortOrder = persistence.SORT_ORDER_NEWEST

	feed, err := profile.NextPage(c)
	require.NoError(err)

	assert.Len(feed.Items, 3)
	assert.Len(feed.Retweets, 0)
	assert.Equal(feed.Items[0].TweetID, TweetID(1439067163508150272))
	assert.Equal(feed.Items[1].TweetID, TweetID(1439027915404939265))
	assert.Equal(feed.Items[2].TweetID, TweetID(1428939163961790466))

	next_cursor := feed.CursorBottom
	assert.Equal(next_cursor.CursorPosition, persistence.CURSOR_MIDDLE)
	assert.Equal(next_cursor.SortOrder, c.SortOrder)
	assert.Equal(next_cursor.Keywords, c.Keywords)
	assert.Equal(next_cursor.PageSize, c.PageSize)
	assert.Equal(next_cursor.CursorValue, 1629520619)

	feed, err = profile.NextPage(next_cursor)
	require.NoError(err)

	assert.Len(feed.Items, 2)
	assert.Len(feed.Retweets, 0)
	assert.Equal(feed.Items[0].TweetID, TweetID(1413772782358433792))
	assert.Equal(feed.Items[1].TweetID, TweetID(1343633011364016128))

	next_cursor = feed.CursorBottom
	assert.Equal(next_cursor.CursorPosition, persistence.CURSOR_END)
}

// Search retweets, sorted by oldest
func TestCursorSearchWithRetweets(t *testing.T) {
	require := require.New(t)
	assert := assert.New(t)

	profile, err := persistence.LoadProfile("../../sample_data/profile")
	require.NoError(err)

	c := persistence.NewCursor()
	c.PageSize = 3
	c.RetweetedByUserHandle = "cernovich"
	c.SortOrder = persistence.SORT_ORDER_OLDEST

	feed, err := profile.NextPage(c)
	require.NoError(err)

	assert.Len(feed.Items, 3)
	assert.Len(feed.Retweets, 3)
	assert.Equal(feed.Items[0].RetweetID, TweetID(1490100255987171332))
	assert.Equal(feed.Items[1].RetweetID, TweetID(1490119308692766723))
	assert.Equal(feed.Items[2].RetweetID, TweetID(1490135787144237058))

	next_cursor := feed.CursorBottom
	assert.Equal(next_cursor.CursorPosition, persistence.CURSOR_MIDDLE)
	assert.Equal(next_cursor.SortOrder, c.SortOrder)
	assert.Equal(next_cursor.Keywords, c.Keywords)
	assert.Equal(next_cursor.PageSize, c.PageSize)
	assert.Equal(next_cursor.CursorValue, 1644111031)

	feed, err = profile.NextPage(next_cursor)
	require.NoError(err)

	assert.Len(feed.Items, 0)
	next_cursor = feed.CursorBottom
	assert.Equal(next_cursor.CursorPosition, persistence.CURSOR_END)
}