Fix handling of reply-mentions and tweet text normalization

This commit is contained in:
Alessio 2021-09-27 18:12:28 -07:00
parent 9ae6213025
commit 159084006d
6 changed files with 96 additions and 46 deletions

View File

@ -74,6 +74,7 @@ type APITweet struct {
CreatedAt string `json:"created_at"` CreatedAt string `json:"created_at"`
FavoriteCount int `json:"favorite_count"` FavoriteCount int `json:"favorite_count"`
FullText string `json:"full_text"` FullText string `json:"full_text"`
DisplayTextRange []int `json:"display_text_range"`
Entities struct { Entities struct {
Hashtags []struct { Hashtags []struct {
Text string `json:"text"` Text string `json:"text"`
@ -87,6 +88,7 @@ type APITweet struct {
UserName string `json:"screen_name"` UserName string `json:"screen_name"`
UserID int64 `json:"id_str,string"` UserID int64 `json:"id_str,string"`
} `json:"user_mentions"` } `json:"user_mentions"`
ReplyMentions string // The leading part of the text which is cut off by "DisplayTextRange"
} `json:"entities"` } `json:"entities"`
ExtendedEntities struct { ExtendedEntities struct {
Media []APIExtendedMedia `json:"media"` Media []APIExtendedMedia `json:"media"`
@ -100,33 +102,16 @@ type APITweet struct {
RetweetedStatusID int64 RetweetedStatusID int64
QuotedStatusIDStr string `json:"quoted_status_id_str"` // Can be empty string QuotedStatusIDStr string `json:"quoted_status_id_str"` // Can be empty string
QuotedStatusID int64 QuotedStatusID int64
QuotedStatusPermalink struct {
URL string `json:"url"`
ExpandedURL string `json:"expanded"`
} `json:"quoted_status_permalink"`
Time time.Time `json:"time"` Time time.Time `json:"time"`
UserID int64 `json:"user_id_str,string"` UserID int64 `json:"user_id_str,string"`
Card APICard `json:"card"` Card APICard `json:"card"`
} }
func (t *APITweet) NormalizeContent() { func (t *APITweet) NormalizeContent() {
// Remove embedded links at the end of the text
if len(t.Entities.URLs) == 1 { // TODO: should this be `>= 1`, like below?
url := t.Entities.URLs[0].ShortenedUrl
if strings.Index(t.FullText, url) == len(t.FullText) - len(url) {
t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the newline
}
}
if len(t.Entities.Media) >= 1 {
url := t.Entities.Media[0].URL
if strings.Index(t.FullText, url) == len(t.FullText) - len(url) {
t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the trailing space
}
}
// Remove leading `@username` for replies
if t.InReplyToScreenName != "" {
if strings.Index(t.FullText, "@" + t.InReplyToScreenName) == 0 {
t.FullText = t.FullText[len(t.InReplyToScreenName) + 1:] // `@`, username, space
}
}
t.FullText = strings.TrimSpace(t.FullText)
id, err := strconv.Atoi(t.QuotedStatusIDStr) id, err := strconv.Atoi(t.QuotedStatusIDStr)
if err == nil { if err == nil {
t.QuotedStatusID = int64(id) t.QuotedStatusID = int64(id)
@ -135,6 +120,27 @@ func (t *APITweet) NormalizeContent() {
if err == nil { if err == nil {
t.RetweetedStatusID = int64(id) t.RetweetedStatusID = int64(id)
} }
if (len(t.DisplayTextRange) == 2) {
t.Entities.ReplyMentions = strings.TrimSpace(string([]rune(t.FullText)[0:t.DisplayTextRange[0]]))
t.FullText = string([]rune(t.FullText)[t.DisplayTextRange[0]:t.DisplayTextRange[1]])
}
// Handle threads
if (t.InReplyToScreenName != "" && t.Entities.ReplyMentions == "") {
// Identify a "thread" as a tweet that replies to something but there's no leading `@reply` text
t.Entities.ReplyMentions = "@" + t.InReplyToScreenName
}
// Handle pasted tweet links that turn into quote tweets but still have a link in them
if t.QuotedStatusID != 0 {
for _, url := range t.Entities.URLs {
if url.ShortenedUrl == t.QuotedStatusPermalink.URL {
t.FullText = strings.ReplaceAll(t.FullText, url.ShortenedUrl, "")
}
}
}
t.FullText = strings.TrimSpace(t.FullText)
} }
func (t APITweet) String() string { func (t APITweet) String() string {

View File

@ -16,15 +16,19 @@ func TestNormalizeContent(t *testing.T) {
quoted_status_id scraper.TweetID quoted_status_id scraper.TweetID
in_reply_to scraper.TweetID in_reply_to scraper.TweetID
retweeted_status_id scraper.TweetID retweeted_status_id scraper.TweetID
reply_mentions string
} { } {
{"test_responses/single_tweets/tweet_that_is_a_reply_with_gif.json", "", 0, 1395882872729477131, 0}, {"test_responses/single_tweets/tweet_that_is_a_reply_with_gif.json", "", 0, 1395882872729477131, 0, "@michaelmalice"},
{"test_responses/single_tweets/tweet_with_image.json", "this saddens me every time", 0, 0, 0}, {"test_responses/single_tweets/tweet_with_image.json", "this saddens me every time", 0, 0, 0, ""},
{"test_responses/single_tweets/tweet_that_is_a_reply.json", "Noted", 0, 1396194494710788100, 0}, {"test_responses/single_tweets/tweet_that_is_a_reply.json", "Noted", 0, 1396194494710788100, 0, "@RvaTeddy @michaelmalice"},
{"test_responses/single_tweets/tweet_with_4_images.json", "These are public health officials who are making decisions about your lifestyle because they know more about health, fitness and well-being than you do", 0, 0, 0}, {"test_responses/single_tweets/tweet_with_4_images.json", "These are public health officials who are making decisions about your lifestyle because they know more about health, fitness and well-being than you do", 0, 0, 0, ""},
{"test_responses/single_tweets/tweet_with_quoted_tweet.json", "", 1422680899670274048, 0, 0}, {"test_responses/single_tweets/tweet_with_at_mentions_in_front.json", "It always does, doesn't it?", 0, 1428907275532476416, 0, "@rob_mose @primalpoly @jmasseypoet @SpaceX"},
{"test_responses/single_tweets/tweet_with_at_mentions_in_front.json", "It always does, doesn't it?", 0, 1428907275532476416, 0}, {"test_responses/single_tweets/tweet_with_unicode_chars.json", "The fact that @michaelmalice new book The Anarchist Handbook is just absolutely destroying on the charts is the largest white pill Ive swallowed in years.", 0, 0, 0, ""},
{"test_responses/single_tweets/tweet_with_unicode_chars.json", "The fact that @michaelmalice new book The Anarchist Handbook is just absolutely destroying on the charts is the largest white pill Ive swallowed in years.", 0, 0, 0}, {"test_responses/single_tweets/tweet_with_quoted_tweet_as_link.json", "", 1422680899670274048, 0, 0, ""},
{"test_responses/single_tweets/tweet_with_quoted_tweet_as_link2.json", "sometimes they're too dimwitted to even get the wrong title right", 1396194494710788100, 1395882872729477131, 0, "@michaelmalice"},
{"test_responses/single_tweets/tweet_with_quoted_tweet_as_link3.json", "I was using an analogy about creating out-groups but the Germans sure love their literalism", 1442092399358930946, 1335678942020300802, 0, "@michaelmalice"},
} }
for _, v := range test_cases { for _, v := range test_cases {
data, err := ioutil.ReadFile(v.filename) data, err := ioutil.ReadFile(v.filename)
if err != nil { if err != nil {
@ -51,6 +55,9 @@ func TestNormalizeContent(t *testing.T) {
if scraper.TweetID(tweet.RetweetedStatusID) != v.retweeted_status_id { if scraper.TweetID(tweet.RetweetedStatusID) != v.retweeted_status_id {
t.Errorf("Expected retweeted status id %d, but got %d", v.retweeted_status_id, tweet.RetweetedStatusID) t.Errorf("Expected retweeted status id %d, but got %d", v.retweeted_status_id, tweet.RetweetedStatusID)
} }
if tweet.Entities.ReplyMentions != v.reply_mentions {
t.Errorf("Expected @reply mentions to be %q, but it was %q", v.reply_mentions, tweet.Entities.ReplyMentions)
}
} }
} }

View File

@ -0,0 +1 @@
{"created_at":"Sun Sep 26 19:32:46 +0000 2021","id_str":"1442210557046792199","full_text":"I was using an analogy about creating out-groups but the Germans sure love their literalism\n\nhttps://t.co/dCMA90L72V","display_text_range":[0,116],"entities":{"urls":[{"url":"https://t.co/dCMA90L72V","expanded_url":"https://twitter.com/CJHopkins_Z23/status/1442092399358930946","display_url":"twitter.com/CJHopkins_Z23/…","indices":[93,116]}]},"source":"<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>","in_reply_to_status_id_str":"1335678942020300802","in_reply_to_user_id_str":"44067298","in_reply_to_screen_name":"michaelmalice","user_id_str":"44067298","is_quote_status":true,"quoted_status_id_str":"1442092399358930946","quoted_status_permalink":{"url":"https://t.co/dCMA90L72V","expanded":"https://twitter.com/CJHopkins_Z23/status/1442092399358930946","display":"twitter.com/CJHopkins_Z23/…"},"retweet_count":36,"favorite_count":386,"reply_count":12,"quote_count":0,"conversation_id_str":"1335381311255683072","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1335381311255683072"}}

View File

@ -3,6 +3,7 @@ package scraper
import ( import (
"time" "time"
"fmt" "fmt"
"strings"
"offline_twitter/terminal_utils" "offline_twitter/terminal_utils"
) )
@ -27,6 +28,7 @@ type Tweet struct {
Images []Image Images []Image
Videos []Video Videos []Video
Mentions []UserHandle Mentions []UserHandle
ReplyMentions []UserHandle
Hashtags []string Hashtags []string
QuotedTweet TweetID QuotedTweet TweetID
@ -115,6 +117,15 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
ret.Mentions = append(ret.Mentions, UserHandle(mention.UserName)) ret.Mentions = append(ret.Mentions, UserHandle(mention.UserName))
} }
for _, mention := range strings.Split(apiTweet.Entities.ReplyMentions, " ") {
if mention != "" {
if mention[0] != '@' {
panic(fmt.Sprintf("Unknown ReplyMention value: %s", apiTweet.Entities.ReplyMentions))
}
ret.ReplyMentions = append(ret.ReplyMentions, UserHandle(mention[1:]))
}
}
ret.QuotedTweet = TweetID(apiTweet.QuotedStatusID) ret.QuotedTweet = TweetID(apiTweet.QuotedStatusID)
for _, entity := range apiTweet.ExtendedEntities.Media { for _, entity := range apiTweet.ExtendedEntities.Media {

View File

@ -223,6 +223,31 @@ func TestParseTweetWithMultipleUrls(t *testing.T) {
} }
} }
// TestTweetWithLotsOfReplyMentions loads the "at mentions in front" fixture,
// parses it with scraper.ParseSingleTweet, and checks that the resulting
// Tweet.ReplyMentions holds exactly the expected handles, in order.
func TestTweetWithLotsOfReplyMentions(t *testing.T) {
	data, err := ioutil.ReadFile("test_responses/single_tweets/tweet_with_at_mentions_in_front.json")
	if err != nil {
		panic(err)
	}
	var apitweet scraper.APITweet
	// A fixture that does not decode makes every later check meaningless, so stop here.
	if err = json.Unmarshal(data, &apitweet); err != nil {
		t.Fatal(err)
	}
	tweet, err := scraper.ParseSingleTweet(apitweet)
	if err != nil {
		t.Fatal(err)
	}

	expected := []scraper.UserHandle{"rob_mose", "primalpoly", "jmasseypoet", "SpaceX"}
	// Must be fatal: the loop below indexes tweet.ReplyMentions by position and
	// would panic with an out-of-range index if fewer handles were parsed.
	if len(tweet.ReplyMentions) != len(expected) {
		t.Fatalf("Expected %d reply-mentions, got %d", len(expected), len(tweet.ReplyMentions))
	}
	for i, v := range expected {
		if tweet.ReplyMentions[i] != v {
			t.Errorf("Expected %q, got %q at position %d", v, tweet.ReplyMentions[i], i)
		}
	}
}
func TestParseTweetResponse(t *testing.T) { func TestParseTweetResponse(t *testing.T) {
data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json") data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")