Fix handling of reply-mentions and tweet text normalization
This commit is contained in:
parent
9ae6213025
commit
159084006d
@ -74,6 +74,7 @@ type APITweet struct {
|
||||
CreatedAt string `json:"created_at"`
|
||||
FavoriteCount int `json:"favorite_count"`
|
||||
FullText string `json:"full_text"`
|
||||
DisplayTextRange []int `json:"display_text_range"`
|
||||
Entities struct {
|
||||
Hashtags []struct {
|
||||
Text string `json:"text"`
|
||||
@ -87,46 +88,30 @@ type APITweet struct {
|
||||
UserName string `json:"screen_name"`
|
||||
UserID int64 `json:"id_str,string"`
|
||||
} `json:"user_mentions"`
|
||||
ReplyMentions string // The leading part of the text which is cut off by "DisplayTextRange"
|
||||
} `json:"entities"`
|
||||
ExtendedEntities struct {
|
||||
Media []APIExtendedMedia `json:"media"`
|
||||
} `json:"extended_entities"`
|
||||
InReplyToStatusID int64 `json:"in_reply_to_status_id_str,string"`
|
||||
InReplyToScreenName string `json:"in_reply_to_screen_name"`
|
||||
ReplyCount int `json:"reply_count"`
|
||||
RetweetCount int `json:"retweet_count"`
|
||||
QuoteCount int `json:"quote_count"`
|
||||
RetweetedStatusIDStr string `json:"retweeted_status_id_str"` // Can be empty string
|
||||
RetweetedStatusID int64
|
||||
QuotedStatusIDStr string `json:"quoted_status_id_str"` // Can be empty string
|
||||
QuotedStatusID int64
|
||||
Time time.Time `json:"time"`
|
||||
UserID int64 `json:"user_id_str,string"`
|
||||
Card APICard `json:"card"`
|
||||
InReplyToStatusID int64 `json:"in_reply_to_status_id_str,string"`
|
||||
InReplyToScreenName string `json:"in_reply_to_screen_name"`
|
||||
ReplyCount int `json:"reply_count"`
|
||||
RetweetCount int `json:"retweet_count"`
|
||||
QuoteCount int `json:"quote_count"`
|
||||
RetweetedStatusIDStr string `json:"retweeted_status_id_str"` // Can be empty string
|
||||
RetweetedStatusID int64
|
||||
QuotedStatusIDStr string `json:"quoted_status_id_str"` // Can be empty string
|
||||
QuotedStatusID int64
|
||||
QuotedStatusPermalink struct {
|
||||
URL string `json:"url"`
|
||||
ExpandedURL string `json:"expanded"`
|
||||
} `json:"quoted_status_permalink"`
|
||||
Time time.Time `json:"time"`
|
||||
UserID int64 `json:"user_id_str,string"`
|
||||
Card APICard `json:"card"`
|
||||
}
|
||||
|
||||
func (t *APITweet) NormalizeContent() {
|
||||
// Remove embedded links at the end of the text
|
||||
if len(t.Entities.URLs) == 1 { // TODO: should this be `>= 1`, like below?
|
||||
url := t.Entities.URLs[0].ShortenedUrl
|
||||
if strings.Index(t.FullText, url) == len(t.FullText) - len(url) {
|
||||
t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the newline
|
||||
}
|
||||
}
|
||||
if len(t.Entities.Media) >= 1 {
|
||||
url := t.Entities.Media[0].URL
|
||||
if strings.Index(t.FullText, url) == len(t.FullText) - len(url) {
|
||||
t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the trailing space
|
||||
}
|
||||
}
|
||||
// Remove leading `@username` for replies
|
||||
if t.InReplyToScreenName != "" {
|
||||
if strings.Index(t.FullText, "@" + t.InReplyToScreenName) == 0 {
|
||||
t.FullText = t.FullText[len(t.InReplyToScreenName) + 1:] // `@`, username, space
|
||||
}
|
||||
}
|
||||
t.FullText = strings.TrimSpace(t.FullText)
|
||||
|
||||
id, err := strconv.Atoi(t.QuotedStatusIDStr)
|
||||
if err == nil {
|
||||
t.QuotedStatusID = int64(id)
|
||||
@ -135,6 +120,27 @@ func (t *APITweet) NormalizeContent() {
|
||||
if err == nil {
|
||||
t.RetweetedStatusID = int64(id)
|
||||
}
|
||||
|
||||
if (len(t.DisplayTextRange) == 2) {
|
||||
t.Entities.ReplyMentions = strings.TrimSpace(string([]rune(t.FullText)[0:t.DisplayTextRange[0]]))
|
||||
t.FullText = string([]rune(t.FullText)[t.DisplayTextRange[0]:t.DisplayTextRange[1]])
|
||||
}
|
||||
|
||||
// Handle threads
|
||||
if (t.InReplyToScreenName != "" && t.Entities.ReplyMentions == "") {
|
||||
// Identify a "thread" as a tweet that replies to something but there's no leading `@reply` text
|
||||
t.Entities.ReplyMentions = "@" + t.InReplyToScreenName
|
||||
}
|
||||
|
||||
// Handle pasted tweet links that turn into quote tweets but still have a link in them
|
||||
if t.QuotedStatusID != 0 {
|
||||
for _, url := range t.Entities.URLs {
|
||||
if url.ShortenedUrl == t.QuotedStatusPermalink.URL {
|
||||
t.FullText = strings.ReplaceAll(t.FullText, url.ShortenedUrl, "")
|
||||
}
|
||||
}
|
||||
}
|
||||
t.FullText = strings.TrimSpace(t.FullText)
|
||||
}
|
||||
|
||||
func (t APITweet) String() string {
|
||||
|
@ -16,15 +16,19 @@ func TestNormalizeContent(t *testing.T) {
|
||||
quoted_status_id scraper.TweetID
|
||||
in_reply_to scraper.TweetID
|
||||
retweeted_status_id scraper.TweetID
|
||||
reply_mentions string
|
||||
} {
|
||||
{"test_responses/single_tweets/tweet_that_is_a_reply_with_gif.json", "", 0, 1395882872729477131, 0},
|
||||
{"test_responses/single_tweets/tweet_with_image.json", "this saddens me every time", 0, 0, 0},
|
||||
{"test_responses/single_tweets/tweet_that_is_a_reply.json", "Noted", 0, 1396194494710788100, 0},
|
||||
{"test_responses/single_tweets/tweet_with_4_images.json", "These are public health officials who are making decisions about your lifestyle because they know more about health, fitness and well-being than you do", 0, 0, 0},
|
||||
{"test_responses/single_tweets/tweet_with_quoted_tweet.json", "", 1422680899670274048, 0, 0},
|
||||
{"test_responses/single_tweets/tweet_with_at_mentions_in_front.json", "It always does, doesn't it?", 0, 1428907275532476416, 0},
|
||||
{"test_responses/single_tweets/tweet_with_unicode_chars.json", "The fact that @michaelmalice new book ‘The Anarchist Handbook’ is just absolutely destroying on the charts is the largest white pill I’ve swallowed in years.", 0, 0, 0},
|
||||
{"test_responses/single_tweets/tweet_that_is_a_reply_with_gif.json", "", 0, 1395882872729477131, 0, "@michaelmalice"},
|
||||
{"test_responses/single_tweets/tweet_with_image.json", "this saddens me every time", 0, 0, 0, ""},
|
||||
{"test_responses/single_tweets/tweet_that_is_a_reply.json", "Noted", 0, 1396194494710788100, 0, "@RvaTeddy @michaelmalice"},
|
||||
{"test_responses/single_tweets/tweet_with_4_images.json", "These are public health officials who are making decisions about your lifestyle because they know more about health, fitness and well-being than you do", 0, 0, 0, ""},
|
||||
{"test_responses/single_tweets/tweet_with_at_mentions_in_front.json", "It always does, doesn't it?", 0, 1428907275532476416, 0, "@rob_mose @primalpoly @jmasseypoet @SpaceX"},
|
||||
{"test_responses/single_tweets/tweet_with_unicode_chars.json", "The fact that @michaelmalice new book ‘The Anarchist Handbook’ is just absolutely destroying on the charts is the largest white pill I’ve swallowed in years.", 0, 0, 0, ""},
|
||||
{"test_responses/single_tweets/tweet_with_quoted_tweet_as_link.json", "", 1422680899670274048, 0, 0, ""},
|
||||
{"test_responses/single_tweets/tweet_with_quoted_tweet_as_link2.json", "sometimes they're too dimwitted to even get the wrong title right", 1396194494710788100, 1395882872729477131, 0, "@michaelmalice"},
|
||||
{"test_responses/single_tweets/tweet_with_quoted_tweet_as_link3.json", "I was using an analogy about creating out-groups but the Germans sure love their literalism", 1442092399358930946, 1335678942020300802, 0, "@michaelmalice"},
|
||||
}
|
||||
|
||||
for _, v := range test_cases {
|
||||
data, err := ioutil.ReadFile(v.filename)
|
||||
if err != nil {
|
||||
@ -51,6 +55,9 @@ func TestNormalizeContent(t *testing.T) {
|
||||
if scraper.TweetID(tweet.RetweetedStatusID) != v.retweeted_status_id {
|
||||
t.Errorf("Expected retweeted status id %d, but got %d", v.retweeted_status_id, tweet.RetweetedStatusID)
|
||||
}
|
||||
if tweet.Entities.ReplyMentions != v.reply_mentions {
|
||||
t.Errorf("Expected @reply mentions to be %q, but it was %q", v.reply_mentions, tweet.Entities.ReplyMentions)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -0,0 +1 @@
|
||||
{"created_at":"Sun Sep 26 19:32:46 +0000 2021","id_str":"1442210557046792199","full_text":"I was using an analogy about creating out-groups but the Germans sure love their literalism\n\nhttps://t.co/dCMA90L72V","display_text_range":[0,116],"entities":{"urls":[{"url":"https://t.co/dCMA90L72V","expanded_url":"https://twitter.com/CJHopkins_Z23/status/1442092399358930946","display_url":"twitter.com/CJHopkins_Z23/…","indices":[93,116]}]},"source":"<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>","in_reply_to_status_id_str":"1335678942020300802","in_reply_to_user_id_str":"44067298","in_reply_to_screen_name":"michaelmalice","user_id_str":"44067298","is_quote_status":true,"quoted_status_id_str":"1442092399358930946","quoted_status_permalink":{"url":"https://t.co/dCMA90L72V","expanded":"https://twitter.com/CJHopkins_Z23/status/1442092399358930946","display":"twitter.com/CJHopkins_Z23/…"},"retweet_count":36,"favorite_count":386,"reply_count":12,"quote_count":0,"conversation_id_str":"1335381311255683072","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1335381311255683072"}}
|
@ -3,6 +3,7 @@ package scraper
|
||||
import (
|
||||
"time"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"offline_twitter/terminal_utils"
|
||||
)
|
||||
@ -23,12 +24,13 @@ type Tweet struct {
|
||||
NumQuoteTweets int
|
||||
InReplyTo TweetID
|
||||
|
||||
Urls []Url
|
||||
Images []Image
|
||||
Videos []Video
|
||||
Mentions []UserHandle
|
||||
Hashtags []string
|
||||
QuotedTweet TweetID
|
||||
Urls []Url
|
||||
Images []Image
|
||||
Videos []Video
|
||||
Mentions []UserHandle
|
||||
ReplyMentions []UserHandle
|
||||
Hashtags []string
|
||||
QuotedTweet TweetID
|
||||
|
||||
IsContentDownloaded bool
|
||||
}
|
||||
@ -115,6 +117,15 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
||||
ret.Mentions = append(ret.Mentions, UserHandle(mention.UserName))
|
||||
}
|
||||
|
||||
for _, mention := range strings.Split(apiTweet.Entities.ReplyMentions, " ") {
|
||||
if mention != "" {
|
||||
if mention[0] != '@' {
|
||||
panic(fmt.Sprintf("Unknown ReplyMention value: %s", apiTweet.Entities.ReplyMentions))
|
||||
}
|
||||
ret.ReplyMentions = append(ret.ReplyMentions, UserHandle(mention[1:]))
|
||||
}
|
||||
}
|
||||
|
||||
ret.QuotedTweet = TweetID(apiTweet.QuotedStatusID)
|
||||
|
||||
for _, entity := range apiTweet.ExtendedEntities.Media {
|
||||
|
@ -223,6 +223,31 @@ func TestParseTweetWithMultipleUrls(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTweetWithLotsOfReplyMentions(t *testing.T) {
|
||||
data, err := ioutil.ReadFile("test_responses/single_tweets/tweet_with_at_mentions_in_front.json")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
var apitweet scraper.APITweet
|
||||
err = json.Unmarshal(data, &apitweet)
|
||||
if err != nil {
|
||||
t.Errorf(err.Error())
|
||||
}
|
||||
tweet, err := scraper.ParseSingleTweet(apitweet)
|
||||
if err != nil {
|
||||
t.Errorf(err.Error())
|
||||
}
|
||||
|
||||
if len(tweet.ReplyMentions) != 4 {
|
||||
t.Errorf("Expected %d reply-mentions, got %d", 4, len(tweet.ReplyMentions))
|
||||
}
|
||||
for i, v := range []scraper.UserHandle{"rob_mose", "primalpoly", "jmasseypoet", "SpaceX"} {
|
||||
if tweet.ReplyMentions[i] != v {
|
||||
t.Errorf("Expected %q, got %q at position %d", v, tweet.ReplyMentions[i], i)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
func TestParseTweetResponse(t *testing.T) {
|
||||
data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")
|
||||
|
Loading…
x
Reference in New Issue
Block a user