Implement search query parsing (not yet hooked up to search bar)

This commit is contained in:
Alessio 2023-08-20 07:55:57 -03:00
parent addcf0ea52
commit fa33199489
3 changed files with 346 additions and 0 deletions

View File

@ -1,8 +1,10 @@
package persistence
import (
"errors"
"fmt"
"strings"
"time"
"gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
)
@ -114,6 +116,7 @@ type Cursor struct {
FilterImages Filter
FilterVideos Filter
FilterPolls Filter
FilterSpaces Filter
FilterReplies Filter
FilterRetweets Filter
FilterOfflineFollowed Filter
@ -167,6 +170,97 @@ func NewUserFeedCursor(h scraper.UserHandle) Cursor {
}
}
// NewCursorFromSearchQuery parses a search-bar query string into a Cursor.
//
// The query is split on spaces into tokens; a double-quoted span is kept
// together as a single token (quotes removed).  Each completed token is
// applied to the cursor via apply_token, so `from:somebody "exact phrase"`
// yields one user filter plus one keyword.
//
// Returns a wrapped ErrInvalidQuery (specifically ErrUnmatchedQuotes) if a
// double-quote is opened but never closed, or if a token fails to apply.
func NewCursorFromSearchQuery(q string) (Cursor, error) {
	ret := NewCursor()

	is_in_quotes := false
	// Use a strings.Builder instead of repeated string concatenation, which
	// is quadratic in the token length.
	var current_token strings.Builder

	for _, char := range q {
		if char == ' ' && !is_in_quotes {
			// Unquoted space: the current token is finished
			if current_token.Len() == 0 {
				// Ignore empty tokens (leading / consecutive spaces)
				continue
			}
			// Add the completed token
			if err := ret.apply_token(current_token.String()); err != nil {
				return Cursor{}, err
			}
			current_token.Reset()
			continue
		}

		if char == '"' {
			if is_in_quotes {
				// Closing quote ends the token immediately
				is_in_quotes = false
				if err := ret.apply_token(current_token.String()); err != nil {
					return Cursor{}, err
				}
				current_token.Reset()
			} else {
				is_in_quotes = true
			}
			continue
		}

		current_token.WriteRune(char)
	}

	// End of query string is reached
	if is_in_quotes {
		return Cursor{}, ErrUnmatchedQuotes
	}
	if current_token.Len() > 0 {
		if err := ret.apply_token(current_token.String()); err != nil {
			return Cursor{}, err
		}
	}
	return ret, nil
}
// ErrInvalidQuery is the base error for any malformed search query; all
// query-parsing errors wrap it, so callers can detect them with errors.Is.
var ErrInvalidQuery = errors.New("invalid search query")

// ErrUnmatchedQuotes is returned when a double-quote in the search query is
// opened but never closed.  It wraps ErrInvalidQuery.
var ErrUnmatchedQuotes = fmt.Errorf("%w (unmatched quotes)", ErrInvalidQuery)
// apply_token updates the cursor with a single search token.
//
// Tokens of the form "prefix:value" are interpreted as filters:
//
//	from:<handle>          tweets authored by the user
//	to:<handle>            tweets replying to the user (may be repeated)
//	retweeted_by:<handle>  tweets retweeted by the user
//	since:<yyyy-mm-dd>     tweets posted on or after the date
//	until:<yyyy-mm-dd>     tweets posted up to the date
//	filter:<links|images|videos|polls|spaces>  tweets with that content
//
// Any token without a ":" is treated as a plain search keyword.  Returns a
// wrapped ErrInvalidQuery if a since:/until: value fails to parse.
func (c *Cursor) apply_token(token string) error {
	// SplitN(…, 2) keeps values that themselves contain ':' intact; a plain
	// Split would silently truncate them to the second segment.
	parts := strings.SplitN(token, ":", 2)
	if len(parts) < 2 {
		c.Keywords = append(c.Keywords, token)
		return nil
	}
	var err error
	switch parts[0] {
	case "from":
		c.FromUserHandle = scraper.UserHandle(parts[1])
	case "to":
		c.ToUserHandles = append(c.ToUserHandles, scraper.UserHandle(parts[1]))
	case "retweeted_by":
		c.RetweetedByUserHandle = scraper.UserHandle(parts[1])
	case "since":
		c.SinceTimestamp.Time, err = time.Parse("2006-01-02", parts[1])
	case "until":
		c.UntilTimestamp.Time, err = time.Parse("2006-01-02", parts[1])
	case "filter":
		switch parts[1] {
		case "links":
			c.FilterLinks = REQUIRE
		case "images":
			c.FilterImages = REQUIRE
		case "videos":
			c.FilterVideos = REQUIRE
		case "polls":
			c.FilterPolls = REQUIRE
		case "spaces":
			c.FilterSpaces = REQUIRE
		}
		// NOTE(review): unknown prefixes and unknown filter values are
		// silently ignored; consider returning ErrInvalidQuery instead.
	}
	if err != nil {
		// Include the underlying parse error so the user can see what was
		// wrong, while still matching errors.Is(err, ErrInvalidQuery).
		return fmt.Errorf("query token %q: %w (%s)", token, ErrInvalidQuery, err)
	}
	return nil
}
func (p Profile) NextPage(c Cursor) (Feed, error) {
where_clauses := []string{}
bind_values := []interface{}{}
@ -230,6 +324,12 @@ func (p Profile) NextPage(c Cursor) (Feed, error) {
case EXCLUDE:
where_clauses = append(where_clauses, "not exists (select 1 from polls where polls.tweet_id = tweets.id)")
}
switch c.FilterSpaces {
case REQUIRE:
where_clauses = append(where_clauses, "space_id != 0")
case EXCLUDE:
where_clauses = append(where_clauses, "space_id = 0")
}
// Filter by lists (e.g., offline-followed)
switch c.FilterOfflineFollowed {

View File

@ -0,0 +1,110 @@
package persistence_test
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"gitlab.com/offline-twitter/twitter_offline_engine/pkg/persistence"
. "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
)
// TestTokenizeSearchString checks that a single bare word becomes one keyword.
func TestTokenizeSearchString(t *testing.T) {
	require := require.New(t)
	assert := assert.New(t)

	c, err := persistence.NewCursorFromSearchQuery("think")
	require.NoError(err)

	assert.Len(c.Keywords, 1)
	assert.Equal("think", c.Keywords[0]) // testify convention: expected value first
}
// TestTokenizeSearchStringMultipleWords checks that unquoted words become
// separate keywords, in order.
func TestTokenizeSearchStringMultipleWords(t *testing.T) {
	require := require.New(t)
	assert := assert.New(t)

	c, err := persistence.NewCursorFromSearchQuery("think tank")
	require.NoError(err)

	assert.Len(c.Keywords, 2)
	// testify convention: expected value first
	assert.Equal("think", c.Keywords[0])
	assert.Equal("tank", c.Keywords[1])
}
// TestTokenizeSearchStringQuotedTokens checks that a double-quoted phrase is
// kept together as a single keyword, with the quotes stripped.
func TestTokenizeSearchStringQuotedTokens(t *testing.T) {
	require := require.New(t)
	assert := assert.New(t)

	c, err := persistence.NewCursorFromSearchQuery(`"think tank"`)
	require.NoError(err)

	assert.Len(c.Keywords, 1)
	assert.Equal("think tank", c.Keywords[0])
}
// TestTokenizeSearchStringFromUser checks the from:, retweeted_by: and to:
// user-filter tokens, and that none of them leak into Keywords.
func TestTokenizeSearchStringFromUser(t *testing.T) {
	require := require.New(t)
	assert := assert.New(t)

	c, err := persistence.NewCursorFromSearchQuery("from:cernovich retweeted_by:blehbleh to:somebody")
	require.NoError(err)

	assert.Len(c.Keywords, 0)
	// testify convention: expected value first
	assert.Equal(UserHandle("cernovich"), c.FromUserHandle)
	assert.Equal(UserHandle("blehbleh"), c.RetweetedByUserHandle)
	assert.Equal([]UserHandle{"somebody"}, c.ToUserHandles)
}
// TestComplexSearchString checks a query mixing bare keywords, a quoted
// phrase, and a from: filter.
func TestComplexSearchString(t *testing.T) {
	require := require.New(t)
	assert := assert.New(t)

	c, err := persistence.NewCursorFromSearchQuery("stupid \"think tank\" from:kashi")
	require.NoError(err)

	assert.Len(c.Keywords, 2)
	// testify convention: expected value first (consistently, for all three)
	assert.Equal("stupid", c.Keywords[0])
	assert.Equal("think tank", c.Keywords[1])
	assert.Equal(UserHandle("kashi"), c.FromUserHandle)
}
func TestSearchStringBadQuotes(t *testing.T) {
require := require.New(t)
assert := assert.New(t)
_, err := persistence.NewCursorFromSearchQuery("asdf \"fjk")
require.Error(err)
assert.ErrorIs(err, persistence.ErrUnmatchedQuotes)
assert.ErrorIs(err, persistence.ErrInvalidQuery)
}
// TestSearchWithDates checks that since: and until: tokens are parsed as
// yyyy-mm-dd dates in UTC.
func TestSearchWithDates(t *testing.T) {
	require := require.New(t)
	assert := assert.New(t)

	c, err := persistence.NewCursorFromSearchQuery("since:2020-01-01 until:2020-05-01")
	require.NoError(err)

	// testify convention: expected value first
	assert.Equal(time.Date(2020, 1, 1, 0, 0, 0, 0, time.UTC), c.SinceTimestamp.Time)
	assert.Equal(time.Date(2020, 5, 1, 0, 0, 0, 0, time.UTC), c.UntilTimestamp.Time)
}
func TestSearchWithInvalidDates(t *testing.T) {
require := require.New(t)
assert := assert.New(t)
_, err := persistence.NewCursorFromSearchQuery("since:fawejk")
require.Error(err)
assert.ErrorIs(err, persistence.ErrInvalidQuery)
}
// TestSearchContentFilters checks that every filter: token sets the
// corresponding cursor filter to REQUIRE.
func TestSearchContentFilters(t *testing.T) {
	require := require.New(t)
	assert := assert.New(t)

	c, err := persistence.NewCursorFromSearchQuery("filter:links filter:videos filter:images filter:polls filter:spaces")
	require.NoError(err)

	// testify convention: expected value first
	assert.Equal(persistence.REQUIRE, c.FilterLinks)
	assert.Equal(persistence.REQUIRE, c.FilterVideos)
	assert.Equal(persistence.REQUIRE, c.FilterImages)
	assert.Equal(persistence.REQUIRE, c.FilterPolls)
	assert.Equal(persistence.REQUIRE, c.FilterSpaces)
}

View File

@ -3,6 +3,8 @@ package persistence_test
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@ -130,3 +132,137 @@ func TestTimeline(t *testing.T) {
assert.Equal(feed.Items[3].RetweetID, TweetID(144919526660333333))
assert.Equal(feed.Items[4].TweetID, TweetID(1413658466795737091))
}
// TestKeywordSearch checks keyword matching against the sample-data profile:
// multiple keywords AND together, a quoted phrase matches exactly, and a
// non-matching keyword returns an empty feed.
func TestKeywordSearch(t *testing.T) {
	require := require.New(t)
	assert := assert.New(t)
	profile, err := persistence.LoadProfile("../../sample_data/profile")
	require.NoError(err)

	c := persistence.NewCursor()

	// Multiple words without quotes
	c.Keywords = []string{"who", "are"}
	feed, err := profile.NextPage(c)
	require.NoError(err)
	// Greater gives a useful failure message, unlike assert.True on a bool
	assert.Greater(len(feed.Items), 1)

	// Add quotes
	c.Keywords = []string{"who are"}
	feed, err = profile.NextPage(c)
	require.NoError(err)
	assert.Len(feed.Items, 1)
	assert.Equal(TweetID(1261483383483293700), feed.Items[0].TweetID) // expected first

	// With gibberish (no matches)
	c.Keywords = []string{"fasdfjkafsldfjsff"}
	feed, err = profile.NextPage(c)
	require.NoError(err)
	assert.Len(feed.Items, 0)
}
// TestSearchReplyingToUser checks the to: filter; multiple handles narrow the
// result to tweets replying to all of them.
func TestSearchReplyingToUser(t *testing.T) {
	require := require.New(t)
	assert := assert.New(t)
	profile, err := persistence.LoadProfile("../../sample_data/profile")
	require.NoError(err)

	c := persistence.NewCursor()

	// Replying to a user
	c.ToUserHandles = []UserHandle{"spacex"}
	feed, err := profile.NextPage(c)
	require.NoError(err)
	assert.Len(feed.Items, 2)
	// testify convention: expected value first
	assert.Equal(TweetID(1428951883058753537), feed.Items[0].TweetID)
	assert.Equal(TweetID(1428939163961790466), feed.Items[1].TweetID)

	// Replying to two users
	c.ToUserHandles = []UserHandle{"spacex", "covfefeanon"}
	feed, err = profile.NextPage(c)
	require.NoError(err)
	assert.Len(feed.Items, 1)
	assert.Equal(TweetID(1428939163961790466), feed.Items[0].TweetID)
}
// TestSearchDateFilters checks the since/until timestamp filters against the
// sample-data profile, sorted by most likes.
func TestSearchDateFilters(t *testing.T) {
	require := require.New(t)
	assert := assert.New(t)
	profile, err := persistence.LoadProfile("../../sample_data/profile")
	require.NoError(err)

	c := persistence.NewCursor()
	c.SortOrder = persistence.SORT_ORDER_MOST_LIKES

	// Since timestamp
	c.SinceTimestamp.Time = time.Date(2021, 10, 1, 0, 0, 0, 0, time.UTC)
	c.FromUserHandle = UserHandle("cernovich")
	feed, err := profile.NextPage(c)
	require.NoError(err)
	assert.Len(feed.Items, 1)
	// testify convention: expected value first
	assert.Equal(TweetID(1453461248142495744), feed.Items[0].TweetID)

	// Until timestamp
	c.SinceTimestamp = TimestampFromUnix(0)
	c.UntilTimestamp.Time = time.Date(2021, 10, 1, 0, 0, 0, 0, time.UTC)
	feed, err = profile.NextPage(c)
	require.NoError(err)
	assert.Len(feed.Items, 3)
	assert.Equal(TweetID(1439027915404939265), feed.Items[0].TweetID)
	assert.Equal(TweetID(1439068749336748043), feed.Items[1].TweetID)
	assert.Equal(TweetID(1439067163508150272), feed.Items[2].TweetID)
}
// TestSearchMediaFilters checks each content filter (links, images, videos,
// polls, spaces) in REQUIRE mode against the sample-data profile.
func TestSearchMediaFilters(t *testing.T) {
	require := require.New(t)
	assert := assert.New(t)
	profile, err := persistence.LoadProfile("../../sample_data/profile")
	require.NoError(err)

	// Links
	c := persistence.NewCursor()
	c.SortOrder = persistence.SORT_ORDER_MOST_LIKES
	c.FilterLinks = persistence.REQUIRE
	feed, err := profile.NextPage(c)
	require.NoError(err)
	assert.Len(feed.Items, 2)
	// testify convention: expected value first
	assert.Equal(TweetID(1438642143170646017), feed.Items[0].TweetID)
	assert.Equal(TweetID(1413665734866186243), feed.Items[1].TweetID)

	// Images
	c = persistence.NewCursor()
	c.SortOrder = persistence.SORT_ORDER_MOST_LIKES
	c.FilterImages = persistence.REQUIRE
	feed, err = profile.NextPage(c)
	require.NoError(err)
	assert.Len(feed.Items, 2)
	assert.Equal(TweetID(1261483383483293700), feed.Items[0].TweetID)
	assert.Equal(TweetID(1426669666928414720), feed.Items[1].TweetID)

	// Videos
	c = persistence.NewCursor()
	c.SortOrder = persistence.SORT_ORDER_MOST_LIKES
	c.FilterVideos = persistence.REQUIRE
	feed, err = profile.NextPage(c)
	require.NoError(err)
	assert.Len(feed.Items, 2)
	assert.Equal(TweetID(1426619468327882761), feed.Items[0].TweetID)
	assert.Equal(TweetID(1453461248142495744), feed.Items[1].TweetID)

	// Polls
	c = persistence.NewCursor()
	c.FilterPolls = persistence.REQUIRE
	feed, err = profile.NextPage(c)
	require.NoError(err)
	assert.Len(feed.Items, 1)
	assert.Equal(TweetID(1465534109573390348), feed.Items[0].TweetID)

	// Spaces
	c = persistence.NewCursor()
	c.FilterSpaces = persistence.REQUIRE
	feed, err = profile.NextPage(c)
	require.NoError(err)
	assert.Len(feed.Items, 1)
	assert.Equal(TweetID(1624833173514293249), feed.Items[0].TweetID)
}