From 159084006dcee6d48002012828cbe04ff301a1a2 Mon Sep 17 00:00:00 2001 From: Alessio Date: Mon, 27 Sep 2021 18:12:28 -0700 Subject: [PATCH] Fix handling of reply-mentions and tweet text normalization --- scraper/api_types.go | 72 ++++++++++--------- scraper/api_types_test.go | 21 ++++-- ...n => tweet_with_quoted_tweet_as_link.json} | 0 .../tweet_with_quoted_tweet_as_link3.json | 1 + scraper/tweet.go | 23 ++++-- scraper/tweet_test.go | 25 +++++++ 6 files changed, 96 insertions(+), 46 deletions(-) rename scraper/test_responses/single_tweets/{tweet_with_quoted_tweet.json => tweet_with_quoted_tweet_as_link.json} (100%) create mode 100644 scraper/test_responses/single_tweets/tweet_with_quoted_tweet_as_link3.json diff --git a/scraper/api_types.go b/scraper/api_types.go index baefbd6..2063851 100644 --- a/scraper/api_types.go +++ b/scraper/api_types.go @@ -74,6 +74,7 @@ type APITweet struct { CreatedAt string `json:"created_at"` FavoriteCount int `json:"favorite_count"` FullText string `json:"full_text"` + DisplayTextRange []int `json:"display_text_range"` Entities struct { Hashtags []struct { Text string `json:"text"` @@ -87,46 +88,30 @@ type APITweet struct { UserName string `json:"screen_name"` UserID int64 `json:"id_str,string"` } `json:"user_mentions"` + ReplyMentions string // The leading part of the text which is cut off by "DisplayTextRange" } `json:"entities"` ExtendedEntities struct { Media []APIExtendedMedia `json:"media"` } `json:"extended_entities"` - InReplyToStatusID int64 `json:"in_reply_to_status_id_str,string"` - InReplyToScreenName string `json:"in_reply_to_screen_name"` - ReplyCount int `json:"reply_count"` - RetweetCount int `json:"retweet_count"` - QuoteCount int `json:"quote_count"` - RetweetedStatusIDStr string `json:"retweeted_status_id_str"` // Can be empty string - RetweetedStatusID int64 - QuotedStatusIDStr string `json:"quoted_status_id_str"` // Can be empty string - QuotedStatusID int64 - Time time.Time `json:"time"` - UserID int64 `json:"user_id_str,string"` - Card APICard `json:"card"` + InReplyToStatusID int64 `json:"in_reply_to_status_id_str,string"` + InReplyToScreenName string `json:"in_reply_to_screen_name"` + ReplyCount int `json:"reply_count"` + RetweetCount int `json:"retweet_count"` + QuoteCount int `json:"quote_count"` + RetweetedStatusIDStr string `json:"retweeted_status_id_str"` // Can be empty string + RetweetedStatusID int64 + QuotedStatusIDStr string `json:"quoted_status_id_str"` // Can be empty string + QuotedStatusID int64 + QuotedStatusPermalink struct { + URL string `json:"url"` + ExpandedURL string `json:"expanded"` + } `json:"quoted_status_permalink"` + Time time.Time `json:"time"` + UserID int64 `json:"user_id_str,string"` + Card APICard `json:"card"` } func (t *APITweet) NormalizeContent() { - // Remove embedded links at the end of the text - if len(t.Entities.URLs) == 1 { // TODO: should this be `>= 1`, like below? - url := t.Entities.URLs[0].ShortenedUrl - if strings.Index(t.FullText, url) == len(t.FullText) - len(url) { - t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the newline - } - } - if len(t.Entities.Media) >= 1 { - url := t.Entities.Media[0].URL - if strings.Index(t.FullText, url) == len(t.FullText) - len(url) { - t.FullText = t.FullText[0:len(t.FullText) - len(url)] // Also strip the trailing space - } - } - // Remove leading `@username` for replies - if t.InReplyToScreenName != "" { - if strings.Index(t.FullText, "@" + t.InReplyToScreenName) == 0 { - t.FullText = t.FullText[len(t.InReplyToScreenName) + 1:] // `@`, username, space - } - } - t.FullText = strings.TrimSpace(t.FullText) - id, err := strconv.Atoi(t.QuotedStatusIDStr) if err == nil { t.QuotedStatusID = int64(id) @@ -135,6 +120,27 @@ func (t *APITweet) NormalizeContent() { if err == nil { t.RetweetedStatusID = int64(id) } + + if (len(t.DisplayTextRange) == 2) { + t.Entities.ReplyMentions = strings.TrimSpace(string([]rune(t.FullText)[0:t.DisplayTextRange[0]])) + t.FullText = string([]rune(t.FullText)[t.DisplayTextRange[0]:t.DisplayTextRange[1]]) + } + + // Handle threads + if (t.InReplyToScreenName != "" && t.Entities.ReplyMentions == "") { + // Identify a "thread" as a tweet that replies to something but there's no leading `@reply` text + t.Entities.ReplyMentions = "@" + t.InReplyToScreenName + } + + // Handle pasted tweet links that turn into quote tweets but still have a link in them + if t.QuotedStatusID != 0 { + for _, url := range t.Entities.URLs { + if url.ShortenedUrl == t.QuotedStatusPermalink.URL { + t.FullText = strings.ReplaceAll(t.FullText, url.ShortenedUrl, "") + } + } + } + t.FullText = strings.TrimSpace(t.FullText) } func (t APITweet) String() string { diff --git a/scraper/api_types_test.go b/scraper/api_types_test.go index 4db850c..643c8bf 100644 --- a/scraper/api_types_test.go +++ b/scraper/api_types_test.go @@ -16,15 +16,19 @@ func TestNormalizeContent(t *testing.T) { quoted_status_id scraper.TweetID in_reply_to scraper.TweetID retweeted_status_id scraper.TweetID + reply_mentions string } { - {"test_responses/single_tweets/tweet_that_is_a_reply_with_gif.json", "", 0, 1395882872729477131, 0}, - {"test_responses/single_tweets/tweet_with_image.json", "this saddens me every time", 0, 0, 0}, - {"test_responses/single_tweets/tweet_that_is_a_reply.json", "Noted", 0, 1396194494710788100, 0}, - {"test_responses/single_tweets/tweet_with_4_images.json", "These are public health officials who are making decisions about your lifestyle because they know more about health, fitness and well-being than you do", 0, 0, 0}, - {"test_responses/single_tweets/tweet_with_quoted_tweet.json", "", 1422680899670274048, 0, 0}, - {"test_responses/single_tweets/tweet_with_at_mentions_in_front.json", "It always does, doesn't it?", 0, 1428907275532476416, 0}, - {"test_responses/single_tweets/tweet_with_unicode_chars.json", "The fact that @michaelmalice new book ‘The Anarchist Handbook’ is just absolutely destroying on the charts is the largest white pill I’ve swallowed in years.", 0, 0, 0}, + {"test_responses/single_tweets/tweet_that_is_a_reply_with_gif.json", "", 0, 1395882872729477131, 0, "@michaelmalice"}, + {"test_responses/single_tweets/tweet_with_image.json", "this saddens me every time", 0, 0, 0, ""}, + {"test_responses/single_tweets/tweet_that_is_a_reply.json", "Noted", 0, 1396194494710788100, 0, "@RvaTeddy @michaelmalice"}, + {"test_responses/single_tweets/tweet_with_4_images.json", "These are public health officials who are making decisions about your lifestyle because they know more about health, fitness and well-being than you do", 0, 0, 0, ""}, + {"test_responses/single_tweets/tweet_with_at_mentions_in_front.json", "It always does, doesn't it?", 0, 1428907275532476416, 0, "@rob_mose @primalpoly @jmasseypoet @SpaceX"}, + {"test_responses/single_tweets/tweet_with_unicode_chars.json", "The fact that @michaelmalice new book ‘The Anarchist Handbook’ is just absolutely destroying on the charts is the largest white pill I’ve swallowed in years.", 0, 0, 0, ""}, + {"test_responses/single_tweets/tweet_with_quoted_tweet_as_link.json", "", 1422680899670274048, 0, 0, ""}, + {"test_responses/single_tweets/tweet_with_quoted_tweet_as_link2.json", "sometimes they're too dimwitted to even get the wrong title right", 1396194494710788100, 1395882872729477131, 0, "@michaelmalice"}, + {"test_responses/single_tweets/tweet_with_quoted_tweet_as_link3.json", "I was using an analogy about creating out-groups but the Germans sure love their literalism", 1442092399358930946, 1335678942020300802, 0, "@michaelmalice"}, } + for _, v := range test_cases { data, err := ioutil.ReadFile(v.filename) if err != nil { @@ -51,6 +55,9 @@ func TestNormalizeContent(t *testing.T) { if scraper.TweetID(tweet.RetweetedStatusID) != v.retweeted_status_id { t.Errorf("Expected retweeted status id %d, but got %d", v.retweeted_status_id, tweet.RetweetedStatusID) } + if tweet.Entities.ReplyMentions != v.reply_mentions { + t.Errorf("Expected @reply mentions to be %q, but it was %q", v.reply_mentions, tweet.Entities.ReplyMentions) + } } } diff --git a/scraper/test_responses/single_tweets/tweet_with_quoted_tweet.json b/scraper/test_responses/single_tweets/tweet_with_quoted_tweet_as_link.json similarity index 100% rename from scraper/test_responses/single_tweets/tweet_with_quoted_tweet.json rename to scraper/test_responses/single_tweets/tweet_with_quoted_tweet_as_link.json diff --git a/scraper/test_responses/single_tweets/tweet_with_quoted_tweet_as_link3.json b/scraper/test_responses/single_tweets/tweet_with_quoted_tweet_as_link3.json new file mode 100644 index 0000000..406d26c --- /dev/null +++ b/scraper/test_responses/single_tweets/tweet_with_quoted_tweet_as_link3.json @@ -0,0 +1 @@ +{"created_at":"Sun Sep 26 19:32:46 +0000 2021","id_str":"1442210557046792199","full_text":"I was using an analogy about creating out-groups but the Germans sure love their literalism\n\nhttps://t.co/dCMA90L72V","display_text_range":[0,116],"entities":{"urls":[{"url":"https://t.co/dCMA90L72V","expanded_url":"https://twitter.com/CJHopkins_Z23/status/1442092399358930946","display_url":"twitter.com/CJHopkins_Z23/…","indices":[93,116]}]},"source":"Twitter Web App","in_reply_to_status_id_str":"1335678942020300802","in_reply_to_user_id_str":"44067298","in_reply_to_screen_name":"michaelmalice","user_id_str":"44067298","is_quote_status":true,"quoted_status_id_str":"1442092399358930946","quoted_status_permalink":{"url":"https://t.co/dCMA90L72V","expanded":"https://twitter.com/CJHopkins_Z23/status/1442092399358930946","display":"twitter.com/CJHopkins_Z23/…"},"retweet_count":36,"favorite_count":386,"reply_count":12,"quote_count":0,"conversation_id_str":"1335381311255683072","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1335381311255683072"}} diff --git a/scraper/tweet.go b/scraper/tweet.go index 685dadb..08cc57e 100644 --- a/scraper/tweet.go +++ b/scraper/tweet.go @@ -3,6 +3,7 @@ package scraper import ( "time" "fmt" + "strings" "offline_twitter/terminal_utils" ) @@ -23,12 +24,13 @@ type Tweet struct { NumQuoteTweets int InReplyTo TweetID - Urls []Url - Images []Image - Videos []Video - Mentions []UserHandle - Hashtags []string - QuotedTweet TweetID + Urls []Url + Images []Image + Videos []Video + Mentions []UserHandle + ReplyMentions []UserHandle + Hashtags []string + QuotedTweet TweetID IsContentDownloaded bool } @@ -115,6 +117,15 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) { ret.Mentions = append(ret.Mentions, UserHandle(mention.UserName)) } + for _, mention := range strings.Split(apiTweet.Entities.ReplyMentions, " ") { + if mention != "" { + if mention[0] != '@' { + panic(fmt.Sprintf("Unknown ReplyMention value: %s", apiTweet.Entities.ReplyMentions)) + } + ret.ReplyMentions = append(ret.ReplyMentions, UserHandle(mention[1:])) + } + } + ret.QuotedTweet = TweetID(apiTweet.QuotedStatusID) for _, entity := range apiTweet.ExtendedEntities.Media { diff --git a/scraper/tweet_test.go b/scraper/tweet_test.go index e6888b0..f574823 100644 --- a/scraper/tweet_test.go +++ b/scraper/tweet_test.go @@ -223,6 +223,31 @@ func TestParseTweetWithMultipleUrls(t *testing.T) { } } +func TestTweetWithLotsOfReplyMentions(t *testing.T) { + data, err := ioutil.ReadFile("test_responses/single_tweets/tweet_with_at_mentions_in_front.json") + if err != nil { + panic(err) + } + var apitweet scraper.APITweet + err = json.Unmarshal(data, &apitweet) + if err != nil { + t.Errorf(err.Error()) + } + tweet, err := scraper.ParseSingleTweet(apitweet) + if err != nil { + t.Errorf(err.Error()) + } + + if len(tweet.ReplyMentions) != 4 { + t.Errorf("Expected %d reply-mentions, got %d", 4, len(tweet.ReplyMentions)) + } + for i, v := range []scraper.UserHandle{"rob_mose", "primalpoly", "jmasseypoet", "SpaceX"} { + if tweet.ReplyMentions[i] != v { + t.Errorf("Expected %q, got %q at position %d", v, tweet.ReplyMentions[i], i) + } + } +} + func TestParseTweetResponse(t *testing.T) { data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")