Fix entity recognition to avoid matching email addresses as mentions

This commit is contained in:
Alessio 2023-08-27 15:33:22 -03:00
parent f8cd326440
commit 8608f06bca
2 changed files with 17 additions and 1 deletions

View File

@ -44,3 +44,15 @@ func TestGetEntitiesHashtagAndMention(t *testing.T) {
assert.Equal(entities[4].EntityType, ENTITY_TYPE_TEXT)
assert.Equal(entities[4].Contents, " in it")
}
func TestGetEntitiesNoMatchEmail(t *testing.T) {
assert := assert.New(t)
require := require.New(t)
s := "My email is somebody@somedomain.com"
entities := get_entities(s)
require.Len(entities, 1)
assert.Equal(entities[0].EntityType, ENTITY_TYPE_TEXT)
assert.Equal(entities[0].Contents, s)
}

View File

@ -167,7 +167,11 @@ type Entity struct {
func get_entities(text string) []Entity {
ret := []Entity{}
start := 0
for _, idxs := range regexp.MustCompile(`[@#]\w+`).FindAllStringIndex(text, -1) {
for _, idxs := range regexp.MustCompile(`(\s|^)[@#]\w+`).FindAllStringIndex(text, -1) {
// Handle leading whitespace. Only match start-of-string or leading whitespace to avoid matching, e.g., emails
if text[idxs[0]] == ' ' || text[idxs[0]] == '\n' {
idxs[0] += 1
}
if start != idxs[0] {
ret = append(ret, Entity{ENTITY_TYPE_TEXT, text[start:idxs[0]]})
}