Fix handling of reply-mentions and tweet text normalization
This commit is contained in:
parent
9ae6213025
commit
159084006d
@ -74,6 +74,7 @@ type APITweet struct {
|
|||||||
CreatedAt string `json:"created_at"`
|
CreatedAt string `json:"created_at"`
|
||||||
FavoriteCount int `json:"favorite_count"`
|
FavoriteCount int `json:"favorite_count"`
|
||||||
FullText string `json:"full_text"`
|
FullText string `json:"full_text"`
|
||||||
|
DisplayTextRange []int `json:"display_text_range"`
|
||||||
Entities struct {
|
Entities struct {
|
||||||
Hashtags []struct {
|
Hashtags []struct {
|
||||||
Text string `json:"text"`
|
Text string `json:"text"`
|
||||||
@ -87,46 +88,30 @@ type APITweet struct {
|
|||||||
UserName string `json:"screen_name"`
|
UserName string `json:"screen_name"`
|
||||||
UserID int64 `json:"id_str,string"`
|
UserID int64 `json:"id_str,string"`
|
||||||
} `json:"user_mentions"`
|
} `json:"user_mentions"`
|
||||||
|
ReplyMentions string // The leading part of the text which is cut off by "DisplayTextRange"
|
||||||
} `json:"entities"`
|
} `json:"entities"`
|
||||||
ExtendedEntities struct {
|
ExtendedEntities struct {
|
||||||
Media []APIExtendedMedia `json:"media"`
|
Media []APIExtendedMedia `json:"media"`
|
||||||
} `json:"extended_entities"`
|
} `json:"extended_entities"`
|
||||||
InReplyToStatusID int64 `json:"in_reply_to_status_id_str,string"`
|
InReplyToStatusID int64 `json:"in_reply_to_status_id_str,string"`
|
||||||
InReplyToScreenName string `json:"in_reply_to_screen_name"`
|
InReplyToScreenName string `json:"in_reply_to_screen_name"`
|
||||||
ReplyCount int `json:"reply_count"`
|
ReplyCount int `json:"reply_count"`
|
||||||
RetweetCount int `json:"retweet_count"`
|
RetweetCount int `json:"retweet_count"`
|
||||||
QuoteCount int `json:"quote_count"`
|
QuoteCount int `json:"quote_count"`
|
||||||
RetweetedStatusIDStr string `json:"retweeted_status_id_str"` // Can be empty string
|
RetweetedStatusIDStr string `json:"retweeted_status_id_str"` // Can be empty string
|
||||||
RetweetedStatusID int64
|
RetweetedStatusID int64
|
||||||
QuotedStatusIDStr string `json:"quoted_status_id_str"` // Can be empty string
|
QuotedStatusIDStr string `json:"quoted_status_id_str"` // Can be empty string
|
||||||
QuotedStatusID int64
|
QuotedStatusID int64
|
||||||
Time time.Time `json:"time"`
|
QuotedStatusPermalink struct {
|
||||||
UserID int64 `json:"user_id_str,string"`
|
URL string `json:"url"`
|
||||||
Card APICard `json:"card"`
|
ExpandedURL string `json:"expanded"`
|
||||||
|
} `json:"quoted_status_permalink"`
|
||||||
|
Time time.Time `json:"time"`
|
||||||
|
UserID int64 `json:"user_id_str,string"`
|
||||||
|
Card APICard `json:"card"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *APITweet) NormalizeContent() {
|
func (t *APITweet) NormalizeContent() {
|
||||||
// Remove embedded links at the end of the text
|
|
||||||
if len(t.Entities.URLs) == 1 { // TODO: should this be `>= 1`, like below?
|
|
||||||
url := t.Entities.URLs[0].ShortenedUrl
|
|
||||||
if strings.Index(t.FullText, url) == len(t.FullText) - len(url) {
|
|
||||||
t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the newline
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if len(t.Entities.Media) >= 1 {
|
|
||||||
url := t.Entities.Media[0].URL
|
|
||||||
if strings.Index(t.FullText, url) == len(t.FullText) - len(url) {
|
|
||||||
t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the trailing space
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Remove leading `@username` for replies
|
|
||||||
if t.InReplyToScreenName != "" {
|
|
||||||
if strings.Index(t.FullText, "@" + t.InReplyToScreenName) == 0 {
|
|
||||||
t.FullText = t.FullText[len(t.InReplyToScreenName) + 1:] // `@`, username, space
|
|
||||||
}
|
|
||||||
}
|
|
||||||
t.FullText = strings.TrimSpace(t.FullText)
|
|
||||||
|
|
||||||
id, err := strconv.Atoi(t.QuotedStatusIDStr)
|
id, err := strconv.Atoi(t.QuotedStatusIDStr)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
t.QuotedStatusID = int64(id)
|
t.QuotedStatusID = int64(id)
|
||||||
@ -135,6 +120,27 @@ func (t *APITweet) NormalizeContent() {
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
t.RetweetedStatusID = int64(id)
|
t.RetweetedStatusID = int64(id)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (len(t.DisplayTextRange) == 2) {
|
||||||
|
t.Entities.ReplyMentions = strings.TrimSpace(string([]rune(t.FullText)[0:t.DisplayTextRange[0]]))
|
||||||
|
t.FullText = string([]rune(t.FullText)[t.DisplayTextRange[0]:t.DisplayTextRange[1]])
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle threads
|
||||||
|
if (t.InReplyToScreenName != "" && t.Entities.ReplyMentions == "") {
|
||||||
|
// Identify a "thread" as a tweet that replies to something but there's no leading `@reply` text
|
||||||
|
t.Entities.ReplyMentions = "@" + t.InReplyToScreenName
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle pasted tweet links that turn into quote tweets but still have a link in them
|
||||||
|
if t.QuotedStatusID != 0 {
|
||||||
|
for _, url := range t.Entities.URLs {
|
||||||
|
if url.ShortenedUrl == t.QuotedStatusPermalink.URL {
|
||||||
|
t.FullText = strings.ReplaceAll(t.FullText, url.ShortenedUrl, "")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
t.FullText = strings.TrimSpace(t.FullText)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t APITweet) String() string {
|
func (t APITweet) String() string {
|
||||||
|
@ -16,15 +16,19 @@ func TestNormalizeContent(t *testing.T) {
|
|||||||
quoted_status_id scraper.TweetID
|
quoted_status_id scraper.TweetID
|
||||||
in_reply_to scraper.TweetID
|
in_reply_to scraper.TweetID
|
||||||
retweeted_status_id scraper.TweetID
|
retweeted_status_id scraper.TweetID
|
||||||
|
reply_mentions string
|
||||||
} {
|
} {
|
||||||
{"test_responses/single_tweets/tweet_that_is_a_reply_with_gif.json", "", 0, 1395882872729477131, 0},
|
{"test_responses/single_tweets/tweet_that_is_a_reply_with_gif.json", "", 0, 1395882872729477131, 0, "@michaelmalice"},
|
||||||
{"test_responses/single_tweets/tweet_with_image.json", "this saddens me every time", 0, 0, 0},
|
{"test_responses/single_tweets/tweet_with_image.json", "this saddens me every time", 0, 0, 0, ""},
|
||||||
{"test_responses/single_tweets/tweet_that_is_a_reply.json", "Noted", 0, 1396194494710788100, 0},
|
{"test_responses/single_tweets/tweet_that_is_a_reply.json", "Noted", 0, 1396194494710788100, 0, "@RvaTeddy @michaelmalice"},
|
||||||
{"test_responses/single_tweets/tweet_with_4_images.json", "These are public health officials who are making decisions about your lifestyle because they know more about health, fitness and well-being than you do", 0, 0, 0},
|
{"test_responses/single_tweets/tweet_with_4_images.json", "These are public health officials who are making decisions about your lifestyle because they know more about health, fitness and well-being than you do", 0, 0, 0, ""},
|
||||||
{"test_responses/single_tweets/tweet_with_quoted_tweet.json", "", 1422680899670274048, 0, 0},
|
{"test_responses/single_tweets/tweet_with_at_mentions_in_front.json", "It always does, doesn't it?", 0, 1428907275532476416, 0, "@rob_mose @primalpoly @jmasseypoet @SpaceX"},
|
||||||
{"test_responses/single_tweets/tweet_with_at_mentions_in_front.json", "It always does, doesn't it?", 0, 1428907275532476416, 0},
|
{"test_responses/single_tweets/tweet_with_unicode_chars.json", "The fact that @michaelmalice new book ‘The Anarchist Handbook’ is just absolutely destroying on the charts is the largest white pill I’ve swallowed in years.", 0, 0, 0, ""},
|
||||||
{"test_responses/single_tweets/tweet_with_unicode_chars.json", "The fact that @michaelmalice new book ‘The Anarchist Handbook’ is just absolutely destroying on the charts is the largest white pill I’ve swallowed in years.", 0, 0, 0},
|
{"test_responses/single_tweets/tweet_with_quoted_tweet_as_link.json", "", 1422680899670274048, 0, 0, ""},
|
||||||
|
{"test_responses/single_tweets/tweet_with_quoted_tweet_as_link2.json", "sometimes they're too dimwitted to even get the wrong title right", 1396194494710788100, 1395882872729477131, 0, "@michaelmalice"},
|
||||||
|
{"test_responses/single_tweets/tweet_with_quoted_tweet_as_link3.json", "I was using an analogy about creating out-groups but the Germans sure love their literalism", 1442092399358930946, 1335678942020300802, 0, "@michaelmalice"},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, v := range test_cases {
|
for _, v := range test_cases {
|
||||||
data, err := ioutil.ReadFile(v.filename)
|
data, err := ioutil.ReadFile(v.filename)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -51,6 +55,9 @@ func TestNormalizeContent(t *testing.T) {
|
|||||||
if scraper.TweetID(tweet.RetweetedStatusID) != v.retweeted_status_id {
|
if scraper.TweetID(tweet.RetweetedStatusID) != v.retweeted_status_id {
|
||||||
t.Errorf("Expected retweeted status id %d, but got %d", v.retweeted_status_id, tweet.RetweetedStatusID)
|
t.Errorf("Expected retweeted status id %d, but got %d", v.retweeted_status_id, tweet.RetweetedStatusID)
|
||||||
}
|
}
|
||||||
|
if tweet.Entities.ReplyMentions != v.reply_mentions {
|
||||||
|
t.Errorf("Expected @reply mentions to be %q, but it was %q", v.reply_mentions, tweet.Entities.ReplyMentions)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -0,0 +1 @@
|
|||||||
|
{"created_at":"Sun Sep 26 19:32:46 +0000 2021","id_str":"1442210557046792199","full_text":"I was using an analogy about creating out-groups but the Germans sure love their literalism\n\nhttps://t.co/dCMA90L72V","display_text_range":[0,116],"entities":{"urls":[{"url":"https://t.co/dCMA90L72V","expanded_url":"https://twitter.com/CJHopkins_Z23/status/1442092399358930946","display_url":"twitter.com/CJHopkins_Z23/…","indices":[93,116]}]},"source":"<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>","in_reply_to_status_id_str":"1335678942020300802","in_reply_to_user_id_str":"44067298","in_reply_to_screen_name":"michaelmalice","user_id_str":"44067298","is_quote_status":true,"quoted_status_id_str":"1442092399358930946","quoted_status_permalink":{"url":"https://t.co/dCMA90L72V","expanded":"https://twitter.com/CJHopkins_Z23/status/1442092399358930946","display":"twitter.com/CJHopkins_Z23/…"},"retweet_count":36,"favorite_count":386,"reply_count":12,"quote_count":0,"conversation_id_str":"1335381311255683072","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1335381311255683072"}}
|
@ -3,6 +3,7 @@ package scraper
|
|||||||
import (
|
import (
|
||||||
"time"
|
"time"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"offline_twitter/terminal_utils"
|
"offline_twitter/terminal_utils"
|
||||||
)
|
)
|
||||||
@ -23,12 +24,13 @@ type Tweet struct {
|
|||||||
NumQuoteTweets int
|
NumQuoteTweets int
|
||||||
InReplyTo TweetID
|
InReplyTo TweetID
|
||||||
|
|
||||||
Urls []Url
|
Urls []Url
|
||||||
Images []Image
|
Images []Image
|
||||||
Videos []Video
|
Videos []Video
|
||||||
Mentions []UserHandle
|
Mentions []UserHandle
|
||||||
Hashtags []string
|
ReplyMentions []UserHandle
|
||||||
QuotedTweet TweetID
|
Hashtags []string
|
||||||
|
QuotedTweet TweetID
|
||||||
|
|
||||||
IsContentDownloaded bool
|
IsContentDownloaded bool
|
||||||
}
|
}
|
||||||
@ -115,6 +117,15 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
|||||||
ret.Mentions = append(ret.Mentions, UserHandle(mention.UserName))
|
ret.Mentions = append(ret.Mentions, UserHandle(mention.UserName))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for _, mention := range strings.Split(apiTweet.Entities.ReplyMentions, " ") {
|
||||||
|
if mention != "" {
|
||||||
|
if mention[0] != '@' {
|
||||||
|
panic(fmt.Sprintf("Unknown ReplyMention value: %s", apiTweet.Entities.ReplyMentions))
|
||||||
|
}
|
||||||
|
ret.ReplyMentions = append(ret.ReplyMentions, UserHandle(mention[1:]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
ret.QuotedTweet = TweetID(apiTweet.QuotedStatusID)
|
ret.QuotedTweet = TweetID(apiTweet.QuotedStatusID)
|
||||||
|
|
||||||
for _, entity := range apiTweet.ExtendedEntities.Media {
|
for _, entity := range apiTweet.ExtendedEntities.Media {
|
||||||
|
@ -223,6 +223,31 @@ func TestParseTweetWithMultipleUrls(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestTweetWithLotsOfReplyMentions(t *testing.T) {
|
||||||
|
data, err := ioutil.ReadFile("test_responses/single_tweets/tweet_with_at_mentions_in_front.json")
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
var apitweet scraper.APITweet
|
||||||
|
err = json.Unmarshal(data, &apitweet)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf(err.Error())
|
||||||
|
}
|
||||||
|
tweet, err := scraper.ParseSingleTweet(apitweet)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf(err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(tweet.ReplyMentions) != 4 {
|
||||||
|
t.Errorf("Expected %d reply-mentions, got %d", 4, len(tweet.ReplyMentions))
|
||||||
|
}
|
||||||
|
for i, v := range []scraper.UserHandle{"rob_mose", "primalpoly", "jmasseypoet", "SpaceX"} {
|
||||||
|
if tweet.ReplyMentions[i] != v {
|
||||||
|
t.Errorf("Expected %q, got %q at position %d", v, tweet.ReplyMentions[i], i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
func TestParseTweetResponse(t *testing.T) {
|
func TestParseTweetResponse(t *testing.T) {
|
||||||
data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")
|
data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user