From fa331994892cad965345ab79e88b3e1830f738a1 Mon Sep 17 00:00:00 2001
From: Alessio
Date: Sun, 20 Aug 2023 07:55:57 -0300
Subject: [PATCH] Implement search query parsing (not yet hooked up to search
 bar)

---
 pkg/persistence/compound_ssf_queries.go       | 104 +++++++++++++
 .../compound_ssf_queries_parse_test.go        | 110 ++++++++++++++
 pkg/persistence/compound_ssf_queries_test.go  | 136 ++++++++++++++++++
 3 files changed, 350 insertions(+)
 create mode 100644 pkg/persistence/compound_ssf_queries_parse_test.go

diff --git a/pkg/persistence/compound_ssf_queries.go b/pkg/persistence/compound_ssf_queries.go
index 66cb825..f5d1288 100644
--- a/pkg/persistence/compound_ssf_queries.go
+++ b/pkg/persistence/compound_ssf_queries.go
@@ -1,8 +1,10 @@
 package persistence
 
 import (
+    "errors"
     "fmt"
     "strings"
+    "time"
 
     "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
 )
@@ -114,6 +116,7 @@ type Cursor struct {
     FilterImages          Filter
     FilterVideos          Filter
     FilterPolls           Filter
+    FilterSpaces          Filter
     FilterReplies         Filter
     FilterRetweets        Filter
     FilterOfflineFollowed Filter
@@ -167,6 +170,101 @@ func NewUserFeedCursor(h scraper.UserHandle) Cursor {
     }
 }
 
+// NewCursorFromSearchQuery parses a user-entered search string into a Cursor.
+// Tokens are separated by spaces; double quotes group several words into one token.
+func NewCursorFromSearchQuery(q string) (Cursor, error) {
+    ret := NewCursor()
+    is_in_quotes := false
+    current_token := ""
+
+    for _, char := range q {
+        if char == ' ' && !is_in_quotes {
+            // Token is finished
+            if current_token == "" {
+                // Ignore empty tokens
+                continue
+            }
+            // Add the completed token
+            if err := ret.apply_token(current_token); err != nil {
+                return Cursor{}, err
+            }
+            current_token = ""
+            continue
+        }
+
+        if char == '"' {
+            if is_in_quotes {
+                is_in_quotes = false
+                if err := ret.apply_token(current_token); err != nil {
+                    return Cursor{}, err
+                }
+                current_token = ""
+                continue
+            } else {
+                is_in_quotes = true
+                continue
+            }
+        }
+
+        // Accumulate this character onto the token being built
+        current_token += string(char)
+    }
+
+    // End of query string is reached
+    if is_in_quotes {
+        return Cursor{}, ErrUnmatchedQuotes
+    }
+    if current_token != "" {
+        if err := ret.apply_token(current_token); err != nil {
+            return Cursor{}, err
+        }
+    }
+    return ret, nil
+}
+
+// Errors returned for malformed search queries; ErrUnmatchedQuotes wraps ErrInvalidQuery.
+var ErrInvalidQuery = errors.New("invalid search query")
+var ErrUnmatchedQuotes = fmt.Errorf("%w (unmatched quotes)", ErrInvalidQuery)
+
+// apply_token interprets one token ("from:someone", "filter:links", or a bare keyword) and applies it to the Cursor.
+func (c *Cursor) apply_token(token string) error {
+    parts := strings.SplitN(token, ":", 2)
+    if len(parts) < 2 {
+        c.Keywords = append(c.Keywords, token)
+        return nil
+    }
+    var err error
+    switch parts[0] {
+    case "from":
+        c.FromUserHandle = scraper.UserHandle(parts[1])
+    case "to":
+        c.ToUserHandles = append(c.ToUserHandles, scraper.UserHandle(parts[1]))
+    case "retweeted_by":
+        c.RetweetedByUserHandle = scraper.UserHandle(parts[1])
+    case "since":
+        c.SinceTimestamp.Time, err = time.Parse("2006-01-02", parts[1])
+    case "until":
+        c.UntilTimestamp.Time, err = time.Parse("2006-01-02", parts[1])
+    case "filter":
+        switch parts[1] {
+        case "links":
+            c.FilterLinks = REQUIRE
+        case "images":
+            c.FilterImages = REQUIRE
+        case "videos":
+            c.FilterVideos = REQUIRE
+        case "polls":
+            c.FilterPolls = REQUIRE
+        case "spaces":
+            c.FilterSpaces = REQUIRE
+        }
+    }
+    if err != nil {
+        return fmt.Errorf("query token %q: %w (%s)", token, ErrInvalidQuery, err)
+    }
+    return nil
+}
+
 func (p Profile) NextPage(c Cursor) (Feed, error) {
     where_clauses := []string{}
     bind_values := []interface{}{}
@@ -230,6 +328,12 @@ func (p Profile) NextPage(c Cursor) (Feed, error) {
     case EXCLUDE:
         where_clauses = append(where_clauses, "not exists (select 1 from polls where polls.tweet_id = tweets.id)")
     }
+    switch c.FilterSpaces {
+    case REQUIRE:
+        where_clauses = append(where_clauses, "space_id != 0")
+    case EXCLUDE:
+        where_clauses = append(where_clauses, "space_id = 0")
+    }
 
     // Filter by lists (e.g., offline-followed)
     switch c.FilterOfflineFollowed {
diff --git a/pkg/persistence/compound_ssf_queries_parse_test.go b/pkg/persistence/compound_ssf_queries_parse_test.go
new file mode 100644
index 0000000..180186a
--- /dev/null
+++ b/pkg/persistence/compound_ssf_queries_parse_test.go
@@ -0,0 +1,110 @@
+package persistence_test
+
+import (
+    "testing"
+
+    "time"
+
+    "github.com/stretchr/testify/assert"
+    "github.com/stretchr/testify/require"
+
+    "gitlab.com/offline-twitter/twitter_offline_engine/pkg/persistence"
+    . "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
+)
+
+func TestTokenizeSearchString(t *testing.T) {
+    require := require.New(t)
+    assert := assert.New(t)
+
+    c, err := persistence.NewCursorFromSearchQuery("think")
+    require.NoError(err)
+    assert.Len(c.Keywords, 1)
+    assert.Equal(c.Keywords[0], "think")
+}
+
+func TestTokenizeSearchStringMultipleWords(t *testing.T) {
+    require := require.New(t)
+    assert := assert.New(t)
+
+    c, err := persistence.NewCursorFromSearchQuery("think tank")
+    require.NoError(err)
+    assert.Len(c.Keywords, 2)
+    assert.Equal(c.Keywords[0], "think")
+    assert.Equal(c.Keywords[1], "tank")
+}
+
+func TestTokenizeSearchStringQuotedTokens(t *testing.T) {
+    require := require.New(t)
+    assert := assert.New(t)
+
+    c, err := persistence.NewCursorFromSearchQuery("\"think tank\"")
+    require.NoError(err)
+    assert.Len(c.Keywords, 1)
+    assert.Equal("think tank", c.Keywords[0])
+}
+
+func TestTokenizeSearchStringFromUser(t *testing.T) {
+    require := require.New(t)
+    assert := assert.New(t)
+
+    c, err := persistence.NewCursorFromSearchQuery("from:cernovich retweeted_by:blehbleh to:somebody")
+    require.NoError(err)
+    assert.Len(c.Keywords, 0)
+    assert.Equal(c.FromUserHandle, UserHandle("cernovich"))
+    assert.Equal(c.RetweetedByUserHandle, UserHandle("blehbleh"))
+    assert.Equal(c.ToUserHandles, []UserHandle{"somebody"})
+}
+
+func TestComplexSearchString(t *testing.T) {
+    require := require.New(t)
+    assert := assert.New(t)
+
+    c, err := persistence.NewCursorFromSearchQuery("stupid \"think tank\" from:kashi")
+    require.NoError(err)
+    assert.Len(c.Keywords, 2)
+    assert.Equal("stupid", c.Keywords[0])
+    assert.Equal("think tank", c.Keywords[1])
+    assert.Equal(c.FromUserHandle, UserHandle("kashi"))
+}
+
+func TestSearchStringBadQuotes(t *testing.T) {
+    require := require.New(t)
+    assert := assert.New(t)
+
+    _, err := persistence.NewCursorFromSearchQuery("asdf \"fjk")
+    require.Error(err)
+    assert.ErrorIs(err, persistence.ErrUnmatchedQuotes)
+    assert.ErrorIs(err, persistence.ErrInvalidQuery)
+}
+
+func TestSearchWithDates(t *testing.T) {
+    require := require.New(t)
+    assert := assert.New(t)
+
+    c, err := persistence.NewCursorFromSearchQuery("since:2020-01-01 until:2020-05-01")
+    require.NoError(err)
+    assert.Equal(c.SinceTimestamp.Time, time.Date(2020, 1, 1, 0, 0, 0, 0, time.UTC))
+    assert.Equal(c.UntilTimestamp.Time, time.Date(2020, 5, 1, 0, 0, 0, 0, time.UTC))
+}
+
+func TestSearchWithInvalidDates(t *testing.T) {
+    require := require.New(t)
+    assert := assert.New(t)
+
+    _, err := persistence.NewCursorFromSearchQuery("since:fawejk")
+    require.Error(err)
+    assert.ErrorIs(err, persistence.ErrInvalidQuery)
+}
+
+func TestSearchContentFilters(t *testing.T) {
+    require := require.New(t)
+    assert := assert.New(t)
+
+    c, err := persistence.NewCursorFromSearchQuery("filter:links filter:videos filter:images filter:polls filter:spaces")
+    require.NoError(err)
+    assert.Equal(c.FilterLinks, persistence.REQUIRE)
+    assert.Equal(c.FilterVideos, persistence.REQUIRE)
+    assert.Equal(c.FilterImages, persistence.REQUIRE)
+    assert.Equal(c.FilterPolls, persistence.REQUIRE)
+    assert.Equal(c.FilterSpaces, persistence.REQUIRE)
+}
diff --git a/pkg/persistence/compound_ssf_queries_test.go b/pkg/persistence/compound_ssf_queries_test.go
index 068e55d..faa0e17 100644
--- a/pkg/persistence/compound_ssf_queries_test.go
+++ b/pkg/persistence/compound_ssf_queries_test.go
@@ -3,6 +3,8 @@ package persistence_test
 import (
     "testing"
 
+    "time"
+
     "github.com/stretchr/testify/assert"
     "github.com/stretchr/testify/require"
 
@@ -130,3 +132,137 @@ func TestTimeline(t *testing.T) {
     assert.Equal(feed.Items[3].RetweetID, TweetID(144919526660333333))
     assert.Equal(feed.Items[4].TweetID, TweetID(1413658466795737091))
 }
+
+func TestKeywordSearch(t *testing.T) {
+    require := require.New(t)
+    assert := assert.New(t)
+
+    profile, err := persistence.LoadProfile("../../sample_data/profile")
+    require.NoError(err)
+    c := persistence.NewCursor()
+
+    // Multiple words without quotes
+    c.Keywords = []string{"who", "are"}
+    feed, err := profile.NextPage(c)
+    require.NoError(err)
+    assert.True(len(feed.Items) > 1)
+
+    // Add quotes
+    c.Keywords = []string{"who are"}
+    feed, err = profile.NextPage(c)
+    require.NoError(err)
+    assert.Len(feed.Items, 1)
+    assert.Equal(feed.Items[0].TweetID, TweetID(1261483383483293700))
+
+    // With gibberish (no matches)
+    c.Keywords = []string{"fasdfjkafsldfjsff"}
+    feed, err = profile.NextPage(c)
+    require.NoError(err)
+    assert.Len(feed.Items, 0)
+}
+
+func TestSearchReplyingToUser(t *testing.T) {
+    require := require.New(t)
+    assert := assert.New(t)
+
+    profile, err := persistence.LoadProfile("../../sample_data/profile")
+    require.NoError(err)
+    c := persistence.NewCursor()
+
+    // Replying to a user
+    c.ToUserHandles = []UserHandle{"spacex"}
+    feed, err := profile.NextPage(c)
+    require.NoError(err)
+    assert.Len(feed.Items, 2)
+    assert.Equal(feed.Items[0].TweetID, TweetID(1428951883058753537))
+    assert.Equal(feed.Items[1].TweetID, TweetID(1428939163961790466))
+
+    // Replying to two users
+    c.ToUserHandles = []UserHandle{"spacex", "covfefeanon"}
+    feed, err = profile.NextPage(c)
+    require.NoError(err)
+    assert.Len(feed.Items, 1)
+    assert.Equal(feed.Items[0].TweetID, TweetID(1428939163961790466))
+}
+
+func TestSearchDateFilters(t *testing.T) {
+    require := require.New(t)
+    assert := assert.New(t)
+
+    profile, err := persistence.LoadProfile("../../sample_data/profile")
+    require.NoError(err)
+    c := persistence.NewCursor()
+    c.SortOrder = persistence.SORT_ORDER_MOST_LIKES
+
+    // Since timestamp
+    c.SinceTimestamp.Time = time.Date(2021, 10, 1, 0, 0, 0, 0, time.UTC)
+    c.FromUserHandle = UserHandle("cernovich")
+    feed, err := profile.NextPage(c)
+    require.NoError(err)
+    assert.Len(feed.Items, 1)
+    assert.Equal(feed.Items[0].TweetID, TweetID(1453461248142495744))
+
+    // Until timestamp
+    c.SinceTimestamp = TimestampFromUnix(0)
+    c.UntilTimestamp.Time = time.Date(2021, 10, 1, 0, 0, 0, 0, time.UTC)
+    feed, err = profile.NextPage(c)
+    require.NoError(err)
+    assert.Len(feed.Items, 3)
+    assert.Equal(feed.Items[0].TweetID, TweetID(1439027915404939265))
+    assert.Equal(feed.Items[1].TweetID, TweetID(1439068749336748043))
+    assert.Equal(feed.Items[2].TweetID, TweetID(1439067163508150272))
+}
+
+func TestSearchMediaFilters(t *testing.T) {
+    require := require.New(t)
+    assert := assert.New(t)
+
+    profile, err := persistence.LoadProfile("../../sample_data/profile")
+    require.NoError(err)
+
+    // Links
+    c := persistence.NewCursor()
+    c.SortOrder = persistence.SORT_ORDER_MOST_LIKES
+    c.FilterLinks = persistence.REQUIRE
+    feed, err := profile.NextPage(c)
+    require.NoError(err)
+    assert.Len(feed.Items, 2)
+    assert.Equal(feed.Items[0].TweetID, TweetID(1438642143170646017))
+    assert.Equal(feed.Items[1].TweetID, TweetID(1413665734866186243))
+
+    // Images
+    c = persistence.NewCursor()
+    c.SortOrder = persistence.SORT_ORDER_MOST_LIKES
+    c.FilterImages = persistence.REQUIRE
+    feed, err = profile.NextPage(c)
+    require.NoError(err)
+    assert.Len(feed.Items, 2)
+    assert.Equal(feed.Items[0].TweetID, TweetID(1261483383483293700))
+    assert.Equal(feed.Items[1].TweetID, TweetID(1426669666928414720))
+
+    // Videos
+    c = persistence.NewCursor()
+    c.SortOrder = persistence.SORT_ORDER_MOST_LIKES
+    c.FilterVideos = persistence.REQUIRE
+    feed, err = profile.NextPage(c)
+    require.NoError(err)
+    assert.Len(feed.Items, 2)
+    assert.Equal(feed.Items[0].TweetID, TweetID(1426619468327882761))
+    assert.Equal(feed.Items[1].TweetID, TweetID(1453461248142495744))
+
+    // Polls
+    c = persistence.NewCursor()
+    c.FilterPolls = persistence.REQUIRE
+    feed, err = profile.NextPage(c)
+    require.NoError(err)
+    assert.Len(feed.Items, 1)
+    assert.Equal(feed.Items[0].TweetID, TweetID(1465534109573390348))
+
+    // Spaces
+    c = persistence.NewCursor()
+    c.FilterSpaces = persistence.REQUIRE
+    feed, err = profile.NextPage(c)
+    require.NoError(err)
+    assert.Len(feed.Items, 1)
+    assert.Equal(feed.Items[0].TweetID, TweetID(1624833173514293249))
+}