From 5568a866518a3c3d553528ef59e193820c7a57dd Mon Sep 17 00:00:00 2001 From: Alessio Date: Fri, 11 Aug 2023 20:56:30 -0300 Subject: [PATCH] Improve expressiveness of Cursor querying, and implement Offline Timeline --- pkg/persistence/compound_queries.go | 4 +- pkg/persistence/compound_ssf_queries.go | 96 +++++++++++++++++--- pkg/persistence/compound_ssf_queries_test.go | 43 +++++++++ 3 files changed, 128 insertions(+), 15 deletions(-) diff --git a/pkg/persistence/compound_queries.go b/pkg/persistence/compound_queries.go index bac7e7b..a0d6ee2 100644 --- a/pkg/persistence/compound_queries.go +++ b/pkg/persistence/compound_queries.go @@ -68,7 +68,7 @@ func (p Profile) fill_content(trove *TweetTrove) { is_content_downloaded, is_followed from users where id in (` + strings.Repeat("?,", len(user_ids)-1) + `?)` - fmt.Printf("%s\n", userquery) + // fmt.Printf("%s\n", userquery) err := p.DB.Select(&users, userquery, user_ids...) if err != nil { panic(err) @@ -82,7 +82,7 @@ func (p Profile) fill_content(trove *TweetTrove) { var images []Image imgquery := ` select id, tweet_id, width, height, remote_url, local_filename, is_downloaded from images where tweet_id in (` + in_clause + `)` - fmt.Printf("%s\n", imgquery) // TODO: SQL logger + // fmt.Printf("%s\n", imgquery) // TODO: SQL logger err := p.DB.Select(&images, imgquery, tweet_ids...) 
if err != nil { panic(err) diff --git a/pkg/persistence/compound_ssf_queries.go b/pkg/persistence/compound_ssf_queries.go index 9ac11fb..a30c92f 100644 --- a/pkg/persistence/compound_ssf_queries.go +++ b/pkg/persistence/compound_ssf_queries.go @@ -73,10 +73,23 @@ const ( CURSOR_END ) +// Whether to require, exclude, or be indifferent to a type of content +type Filter int + +const ( + // Filter is not used + NONE Filter = iota + // All results must match the filter + REQUIRE + // Results must not match the filter + EXCLUDE +) + type CursorResult struct { scraper.Tweet scraper.Retweet - Chrono int `db:"chrono"` + Chrono int `db:"chrono"` + ByUserID scraper.UserID `db:"by_user_id"` } type Cursor struct { @@ -92,10 +105,13 @@ type Cursor struct { RetweetedByUserHandle scraper.UserHandle SinceTimestamp scraper.Timestamp UntilTimestamp scraper.Timestamp - FilterLinks bool - FilterImages bool - FilterVideos bool - FilterPolls bool + FilterLinks Filter + FilterImages Filter + FilterVideos Filter + FilterPolls Filter + FilterReplies Filter + FilterRetweets Filter + FilterOfflineFollowed Filter } func NewCursor() Cursor { @@ -108,6 +124,23 @@ func NewCursor() Cursor { CursorValue: 0, SortOrder: SORT_ORDER_NEWEST, PageSize: 50, + + FilterRetweets: EXCLUDE, + } +} + +func NewTimelineCursor() Cursor { + return Cursor{ + Keywords: []string{}, + ToUserHandles: []scraper.UserHandle{}, + SinceTimestamp: scraper.TimestampFromUnix(0), + UntilTimestamp: scraper.TimestampFromUnix(0), + CursorPosition: CURSOR_START, + CursorValue: 0, + SortOrder: SORT_ORDER_NEWEST, + PageSize: 50, + + FilterOfflineFollowed: REQUIRE, + } } @@ -130,8 +163,10 @@ func (p Profile) NextPage(c Cursor) (Feed, error) { where_clauses = append(where_clauses, "reply_mentions like ?") bind_values = append(bind_values, fmt.Sprintf("%%%s%%", to_user)) } - where_clauses = append(where_clauses, "retweeted_by = coalesce((select id from users where handle like ?), 0)") - bind_values = append(bind_values, c.RetweetedByUserHandle) 
+ if c.RetweetedByUserHandle != "" { + where_clauses = append(where_clauses, "retweeted_by = (select id from users where handle like ?)") + bind_values = append(bind_values, c.RetweetedByUserHandle) + } // Since and until timestamps if c.SinceTimestamp.Unix() != 0 { @@ -144,17 +179,49 @@ func (p Profile) NextPage(c Cursor) (Feed, error) { } // Media filters - if c.FilterLinks { + switch c.FilterLinks { + case REQUIRE: where_clauses = append(where_clauses, "exists (select 1 from urls where urls.tweet_id = tweets.id)") + case EXCLUDE: + where_clauses = append(where_clauses, "not exists (select 1 from urls where urls.tweet_id = tweets.id)") } - if c.FilterImages { + switch c.FilterImages { + case REQUIRE: where_clauses = append(where_clauses, "exists (select 1 from images where images.tweet_id = tweets.id)") + case EXCLUDE: + where_clauses = append(where_clauses, "not exists (select 1 from images where images.tweet_id = tweets.id)") } - if c.FilterVideos { + switch c.FilterVideos { + case REQUIRE: where_clauses = append(where_clauses, "exists (select 1 from videos where videos.tweet_id = tweets.id)") + case EXCLUDE: + where_clauses = append(where_clauses, "not exists (select 1 from videos where videos.tweet_id = tweets.id)") } - if c.FilterPolls { + switch c.FilterPolls { + case REQUIRE: where_clauses = append(where_clauses, "exists (select 1 from polls where polls.tweet_id = tweets.id)") + case EXCLUDE: + where_clauses = append(where_clauses, "not exists (select 1 from polls where polls.tweet_id = tweets.id)") + } + + // Filter by lists (e.g., offline-followed) + switch c.FilterOfflineFollowed { + case REQUIRE: + where_clauses = append(where_clauses, "by_user_id in (select id from users where is_followed = 1)") + case EXCLUDE: + where_clauses = append(where_clauses, "by_user_id not in (select id from users where is_followed = 1)") + } + switch c.FilterReplies { + case REQUIRE: + where_clauses = append(where_clauses, "in_reply_to_id != 0") + case EXCLUDE: + 
where_clauses = append(where_clauses, "in_reply_to_id = 0") + } + switch c.FilterRetweets { + case REQUIRE: + where_clauses = append(where_clauses, "retweet_id != 0") + case EXCLUDE: + where_clauses = append(where_clauses, "retweet_id = 0") } // Pagination @@ -170,7 +237,7 @@ func (p Profile) NextPage(c Cursor) (Feed, error) { is_expandable, is_stub, is_content_downloaded, is_conversation_scraped, last_scraped_at, 0 tweet_id, 0 retweet_id, 0 retweeted_by, 0 retweeted_at, - posted_at chrono + posted_at chrono, user_id by_user_id from tweets left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid ` + where_clause + ` @@ -182,7 +249,7 @@ func (p Profile) NextPage(c Cursor) (Feed, error) { is_expandable, is_stub, is_content_downloaded, is_conversation_scraped, last_scraped_at, tweet_id, retweet_id, retweeted_by, retweeted_at, - retweeted_at chrono + retweeted_at chrono, retweeted_by by_user_id from retweets left join tweets on retweets.tweet_id = tweets.id left join tombstone_types on tweets.tombstone_type = tombstone_types.rowid @@ -193,6 +260,8 @@ func (p Profile) NextPage(c Cursor) (Feed, error) { bind_values = append(bind_values, bind_values...) bind_values = append(bind_values, c.PageSize) + // fmt.Printf("Query: %s\n", q) + // fmt.Printf("Bind values: %#v\n", bind_values) // Run the query var results []CursorResult err := p.DB.Select(&results, q, bind_values...) 
@@ -203,6 +272,7 @@ func (p Profile) NextPage(c Cursor) (Feed, error) { // Assemble the feed ret := NewFeed() for _, val := range results { + // fmt.Printf("\tResult: %#v\n", val) ret.Tweets[val.Tweet.ID] = val.Tweet if val.Retweet.RetweetID != 0 { ret.Retweets[val.Retweet.RetweetID] = val.Retweet diff --git a/pkg/persistence/compound_ssf_queries_test.go b/pkg/persistence/compound_ssf_queries_test.go index 7009980..068e55d 100644 --- a/pkg/persistence/compound_ssf_queries_test.go +++ b/pkg/persistence/compound_ssf_queries_test.go @@ -62,6 +62,7 @@ func TestCursorSearchWithRetweets(t *testing.T) { c := persistence.NewCursor() c.PageSize = 3 c.RetweetedByUserHandle = "cernovich" + c.FilterRetweets = persistence.REQUIRE c.SortOrder = persistence.SORT_ORDER_OLDEST feed, err := profile.NextPage(c) @@ -87,3 +88,45 @@ func TestCursorSearchWithRetweets(t *testing.T) { next_cursor = feed.CursorBottom assert.Equal(next_cursor.CursorPosition, persistence.CURSOR_END) } + +// Offline Following Timeline +func TestTimeline(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + profile, err := persistence.LoadProfile("../../sample_data/profile") + require.NoError(err) + + c := persistence.NewTimelineCursor() + c.PageSize = 5 + + feed, err := profile.NextPage(c) + require.NoError(err) + + assert.Len(feed.Items, 5) + assert.Len(feed.Retweets, 4) + assert.Equal(feed.Items[0].RetweetID, TweetID(1490135787144237058)) + assert.Equal(feed.Items[1].RetweetID, TweetID(1490135787124232222)) + assert.Equal(feed.Items[2].RetweetID, TweetID(1490119308692766723)) + assert.Equal(feed.Items[3].RetweetID, TweetID(1490100255987171332)) + assert.Equal(feed.Items[4].TweetID, TweetID(1453461248142495744)) + + next_cursor := feed.CursorBottom + assert.Equal(next_cursor.CursorPosition, persistence.CURSOR_MIDDLE) + assert.Equal(next_cursor.SortOrder, c.SortOrder) + assert.Equal(next_cursor.Keywords, c.Keywords) + assert.Equal(next_cursor.PageSize, c.PageSize) + 
assert.Equal(next_cursor.CursorValue, 1635367140) + + next_cursor.CursorValue = 1631935323 // Scroll down a bit, kind of randomly + feed, err = profile.NextPage(next_cursor) + require.NoError(err) + + assert.Len(feed.Items, 5) + assert.Len(feed.Retweets, 1) + assert.Equal(feed.Items[0].TweetID, TweetID(1439027915404939265)) + assert.Equal(feed.Items[1].TweetID, TweetID(1413773185296650241)) + assert.Equal(feed.Items[2].TweetID, TweetID(1413664406995566593)) + assert.Equal(feed.Items[3].RetweetID, TweetID(144919526660333333)) + assert.Equal(feed.Items[4].TweetID, TweetID(1413658466795737091)) +}