Fix entity recognition to avoid matching email addresses
This commit is contained in:
parent
f8cd326440
commit
8608f06bca
@ -44,3 +44,15 @@ func TestGetEntitiesHashtagAndMention(t *testing.T) {
|
|||||||
assert.Equal(entities[4].EntityType, ENTITY_TYPE_TEXT)
|
assert.Equal(entities[4].EntityType, ENTITY_TYPE_TEXT)
|
||||||
assert.Equal(entities[4].Contents, " in it")
|
assert.Equal(entities[4].Contents, " in it")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestGetEntitiesNoMatchEmail(t *testing.T) {
|
||||||
|
assert := assert.New(t)
|
||||||
|
require := require.New(t)
|
||||||
|
|
||||||
|
s := "My email is somebody@somedomain.com"
|
||||||
|
entities := get_entities(s)
|
||||||
|
|
||||||
|
require.Len(entities, 1)
|
||||||
|
assert.Equal(entities[0].EntityType, ENTITY_TYPE_TEXT)
|
||||||
|
assert.Equal(entities[0].Contents, s)
|
||||||
|
}
|
||||||
|
@ -167,7 +167,11 @@ type Entity struct {
|
|||||||
func get_entities(text string) []Entity {
|
func get_entities(text string) []Entity {
|
||||||
ret := []Entity{}
|
ret := []Entity{}
|
||||||
start := 0
|
start := 0
|
||||||
for _, idxs := range regexp.MustCompile(`[@#]\w+`).FindAllStringIndex(text, -1) {
|
for _, idxs := range regexp.MustCompile(`(\s|^)[@#]\w+`).FindAllStringIndex(text, -1) {
|
||||||
|
// Handle leading whitespace. Only match start-of-string or leading whitespace to avoid matching, e.g., emails
|
||||||
|
if text[idxs[0]] == ' ' || text[idxs[0]] == '\n' {
|
||||||
|
idxs[0] += 1
|
||||||
|
}
|
||||||
if start != idxs[0] {
|
if start != idxs[0] {
|
||||||
ret = append(ret, Entity{ENTITY_TYPE_TEXT, text[start:idxs[0]]})
|
ret = append(ret, Entity{ENTITY_TYPE_TEXT, text[start:idxs[0]]})
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user