Fix handling of reply-mentions and tweet text normalization

2021-09-27 18:12:28 -07:00 · 2021-09-27 18:12:28 -07:00 · 159084006d
commit 159084006d
parent 9ae6213025
6 changed files with 96 additions and 46 deletions
--- a/scraper/api_types.go
+++ b/scraper/api_types.go
@ -74,6 +74,7 @@ type APITweet struct {
 	CreatedAt         string `json:"created_at"`
 	FavoriteCount     int    `json:"favorite_count"`
 	FullText          string `json:"full_text"`
+	DisplayTextRange  []int  `json:"display_text_range"`
 	Entities          struct {
 		Hashtags []struct {
 			Text string `json:"text"`
@ -87,46 +88,30 @@ type APITweet struct {
 			UserName string `json:"screen_name"`
 			UserID   int64  `json:"id_str,string"`
 		} `json:"user_mentions"`
+		ReplyMentions string  // The leading part of the text which is cut off by "DisplayTextRange"
 	} `json:"entities"`
 	ExtendedEntities struct {
 		Media []APIExtendedMedia `json:"media"`
 	} `json:"extended_entities"`
-	InReplyToStatusID    int64     `json:"in_reply_to_status_id_str,string"`
-	InReplyToScreenName  string    `json:"in_reply_to_screen_name"`
-	ReplyCount           int       `json:"reply_count"`
-	RetweetCount         int       `json:"retweet_count"`
-	QuoteCount           int       `json:"quote_count"`
-	RetweetedStatusIDStr string    `json:"retweeted_status_id_str"`  // Can be empty string
-	RetweetedStatusID    int64
-	QuotedStatusIDStr    string    `json:"quoted_status_id_str"`     // Can be empty string
-	QuotedStatusID       int64
-	Time                 time.Time `json:"time"`
-	UserID               int64     `json:"user_id_str,string"`
-	Card                 APICard   `json:"card"`
+	InReplyToStatusID     int64     `json:"in_reply_to_status_id_str,string"`
+	InReplyToScreenName   string    `json:"in_reply_to_screen_name"`
+	ReplyCount            int       `json:"reply_count"`
+	RetweetCount          int       `json:"retweet_count"`
+	QuoteCount            int       `json:"quote_count"`
+	RetweetedStatusIDStr  string    `json:"retweeted_status_id_str"`  // Can be empty string
+	RetweetedStatusID     int64
+	QuotedStatusIDStr     string    `json:"quoted_status_id_str"`     // Can be empty string
+	QuotedStatusID        int64
+	QuotedStatusPermalink struct {
+		URL         string `json:"url"`
+		ExpandedURL string `json:"expanded"`
+	} `json:"quoted_status_permalink"`
+	Time                  time.Time `json:"time"`
+	UserID                int64     `json:"user_id_str,string"`
+	Card                  APICard   `json:"card"`
 }

 func (t *APITweet) NormalizeContent() {
-	// Remove embedded links at the end of the text
-	if len(t.Entities.URLs) == 1 {  // TODO: should this be `>= 1`, like below?
-		url := t.Entities.URLs[0].ShortenedUrl
-		if strings.Index(t.FullText, url) == len(t.FullText) - len(url) {
-			t.FullText = t.FullText[0:len(t.FullText) - len(url)]  // Also strip the newline
-		}
-	}
-	if len(t.Entities.Media) >= 1 {
-		url := t.Entities.Media[0].URL
-		if strings.Index(t.FullText, url) == len(t.FullText) - len(url) {
-			t.FullText = t.FullText[0:len(t.FullText) - len(url)]  // Also strip the trailing space
-		}
-	}
-	// Remove leading `@username` for replies
-	if t.InReplyToScreenName != "" {
-		if strings.Index(t.FullText, "@" + t.InReplyToScreenName) == 0 {
-			t.FullText = t.FullText[len(t.InReplyToScreenName) + 1:]  // `@`, username, space
-		}
-	}
-	t.FullText = strings.TrimSpace(t.FullText)
-
 	id, err := strconv.Atoi(t.QuotedStatusIDStr)
 	if err == nil {
 		t.QuotedStatusID = int64(id)
@ -135,6 +120,27 @@ func (t *APITweet) NormalizeContent() {
 	if err == nil {
 		t.RetweetedStatusID = int64(id)
 	}
+
+	if (len(t.DisplayTextRange) == 2) {
+		t.Entities.ReplyMentions = strings.TrimSpace(string([]rune(t.FullText)[0:t.DisplayTextRange[0]]))
+		t.FullText = string([]rune(t.FullText)[t.DisplayTextRange[0]:t.DisplayTextRange[1]])
+	}
+
+	// Handle threads
+	if (t.InReplyToScreenName != "" && t.Entities.ReplyMentions == "") {
+		// Identify a "thread" as a tweet that replies to something but there's no leading `@reply` text
+		t.Entities.ReplyMentions = "@" + t.InReplyToScreenName
+	}
+
+	// Handle pasted tweet links that turn into quote tweets but still have a link in them
+	if t.QuotedStatusID != 0 {
+		for _, url := range t.Entities.URLs {
+			if url.ShortenedUrl == t.QuotedStatusPermalink.URL {
+				t.FullText = strings.ReplaceAll(t.FullText, url.ShortenedUrl, "")
+			}
+		}
+	}
+	t.FullText = strings.TrimSpace(t.FullText)
 }

 func (t APITweet) String() string {
--- a/scraper/api_types_test.go
+++ b/scraper/api_types_test.go
@ -16,15 +16,19 @@ func TestNormalizeContent(t *testing.T) {
 		quoted_status_id scraper.TweetID
 		in_reply_to scraper.TweetID
 		retweeted_status_id scraper.TweetID
+		reply_mentions string
 	} {
-		{"test_responses/single_tweets/tweet_that_is_a_reply_with_gif.json", "", 0, 1395882872729477131, 0},
-		{"test_responses/single_tweets/tweet_with_image.json", "this saddens me every time", 0, 0, 0},
-		{"test_responses/single_tweets/tweet_that_is_a_reply.json", "Noted", 0, 1396194494710788100, 0},
-		{"test_responses/single_tweets/tweet_with_4_images.json", "These are public health officials who are making decisions about your lifestyle because they know more about health, fitness and well-being than you do", 0, 0, 0},
-		{"test_responses/single_tweets/tweet_with_quoted_tweet.json", "", 1422680899670274048, 0, 0},
-		{"test_responses/single_tweets/tweet_with_at_mentions_in_front.json", "It always does, doesn't it?", 0, 1428907275532476416, 0},
-		{"test_responses/single_tweets/tweet_with_unicode_chars.json", "The fact that @michaelmalice new book ‘The Anarchist Handbook’ is just absolutely destroying on the charts is the largest white pill I’ve swallowed in years.", 0, 0, 0},
+		{"test_responses/single_tweets/tweet_that_is_a_reply_with_gif.json", "", 0, 1395882872729477131, 0, "@michaelmalice"},
+		{"test_responses/single_tweets/tweet_with_image.json", "this saddens me every time", 0, 0, 0, ""},
+		{"test_responses/single_tweets/tweet_that_is_a_reply.json", "Noted", 0, 1396194494710788100, 0, "@RvaTeddy @michaelmalice"},
+		{"test_responses/single_tweets/tweet_with_4_images.json", "These are public health officials who are making decisions about your lifestyle because they know more about health, fitness and well-being than you do", 0, 0, 0, ""},
+		{"test_responses/single_tweets/tweet_with_at_mentions_in_front.json", "It always does, doesn't it?", 0, 1428907275532476416, 0, "@rob_mose @primalpoly @jmasseypoet @SpaceX"},
+		{"test_responses/single_tweets/tweet_with_unicode_chars.json", "The fact that @michaelmalice new book ‘The Anarchist Handbook’ is just absolutely destroying on the charts is the largest white pill I’ve swallowed in years.", 0, 0, 0, ""},
+		{"test_responses/single_tweets/tweet_with_quoted_tweet_as_link.json", "", 1422680899670274048, 0, 0, ""},
+		{"test_responses/single_tweets/tweet_with_quoted_tweet_as_link2.json", "sometimes they're too dimwitted to even get the wrong title right", 1396194494710788100, 1395882872729477131, 0, "@michaelmalice"},
+		{"test_responses/single_tweets/tweet_with_quoted_tweet_as_link3.json", "I was using an analogy about creating out-groups but the Germans sure love their literalism", 1442092399358930946, 1335678942020300802, 0, "@michaelmalice"},
 	}
+
 	for _, v := range test_cases {
 		data, err := ioutil.ReadFile(v.filename)
 		if err != nil {
@ -51,6 +55,9 @@ func TestNormalizeContent(t *testing.T) {
 		if scraper.TweetID(tweet.RetweetedStatusID) != v.retweeted_status_id {
 			t.Errorf("Expected retweeted status id %d, but got %d", v.retweeted_status_id, tweet.RetweetedStatusID)
 		}
+		if tweet.Entities.ReplyMentions != v.reply_mentions {
+			t.Errorf("Expected @reply mentions to be %q, but it was %q", v.reply_mentions, tweet.Entities.ReplyMentions)
+		}
 	}
 }

--- a/scraper/test_responses/single_tweets/tweet_with_quoted_tweet_as_link.json
+++ b/scraper/test_responses/single_tweets/tweet_with_quoted_tweet_as_link.json
--- a/scraper/test_responses/single_tweets/tweet_with_quoted_tweet_as_link3.json
+++ b/scraper/test_responses/single_tweets/tweet_with_quoted_tweet_as_link3.json
@ -0,0 +1 @@
+{"created_at":"Sun Sep 26 19:32:46 +0000 2021","id_str":"1442210557046792199","full_text":"I was using an analogy about creating out-groups but the Germans sure love their literalism\n\nhttps://t.co/dCMA90L72V","display_text_range":[0,116],"entities":{"urls":[{"url":"https://t.co/dCMA90L72V","expanded_url":"https://twitter.com/CJHopkins_Z23/status/1442092399358930946","display_url":"twitter.com/CJHopkins_Z23/…","indices":[93,116]}]},"source":"<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>","in_reply_to_status_id_str":"1335678942020300802","in_reply_to_user_id_str":"44067298","in_reply_to_screen_name":"michaelmalice","user_id_str":"44067298","is_quote_status":true,"quoted_status_id_str":"1442092399358930946","quoted_status_permalink":{"url":"https://t.co/dCMA90L72V","expanded":"https://twitter.com/CJHopkins_Z23/status/1442092399358930946","display":"twitter.com/CJHopkins_Z23/…"},"retweet_count":36,"favorite_count":386,"reply_count":12,"quote_count":0,"conversation_id_str":"1335381311255683072","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1335381311255683072"}}
--- a/scraper/tweet.go
+++ b/scraper/tweet.go
@ -3,6 +3,7 @@ package scraper
 import (
 	"time"
 	"fmt"
+	"strings"

 	"offline_twitter/terminal_utils"
 )
@ -23,12 +24,13 @@ type Tweet struct {
 	NumQuoteTweets int
 	InReplyTo      TweetID

-	Urls        []Url
-	Images      []Image
-	Videos      []Video
-	Mentions    []UserHandle
-	Hashtags    []string
-	QuotedTweet TweetID
+	Urls          []Url
+	Images        []Image
+	Videos        []Video
+	Mentions      []UserHandle
+	ReplyMentions []UserHandle
+	Hashtags      []string
+	QuotedTweet   TweetID

 	IsContentDownloaded bool
 }
@ -115,6 +117,15 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
 		ret.Mentions = append(ret.Mentions, UserHandle(mention.UserName))
 	}

+	for _, mention := range strings.Split(apiTweet.Entities.ReplyMentions, " ") {
+		if mention != "" {
+			if mention[0] != '@' {
+				panic(fmt.Sprintf("Unknown ReplyMention value: %s", apiTweet.Entities.ReplyMentions))
+			}
+			ret.ReplyMentions = append(ret.ReplyMentions, UserHandle(mention[1:]))
+		}
+	}
+
 	ret.QuotedTweet = TweetID(apiTweet.QuotedStatusID)

 	for _, entity := range apiTweet.ExtendedEntities.Media {
--- a/scraper/tweet_test.go
+++ b/scraper/tweet_test.go
@ -223,6 +223,31 @@ func TestParseTweetWithMultipleUrls(t *testing.T) {
 	}
 }

+// TestTweetWithLotsOfReplyMentions checks that a reply's leading @-mentions are parsed, in order, into Tweet.ReplyMentions.
+func TestTweetWithLotsOfReplyMentions(t *testing.T) {
+	data, err := ioutil.ReadFile("test_responses/single_tweets/tweet_with_at_mentions_in_front.json")
+	if err != nil {
+		panic(err)
+	}
+	var apitweet scraper.APITweet
+	if err = json.Unmarshal(data, &apitweet); err != nil {
+		t.Fatal(err) // nothing below is meaningful without a decoded tweet
+	}
+	tweet, err := scraper.ParseSingleTweet(apitweet)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Fatalf, not Errorf: the loop below indexes ReplyMentions and would panic on a short slice.
+	if len(tweet.ReplyMentions) != 4 {
+		t.Fatalf("Expected %d reply-mentions, got %d", 4, len(tweet.ReplyMentions))
+	}
+	for i, v := range []scraper.UserHandle{"rob_mose", "primalpoly", "jmasseypoet", "SpaceX"} {
+		if tweet.ReplyMentions[i] != v {
+			t.Errorf("Expected %q, got %q at position %d", v, tweet.ReplyMentions[i], i)
+		}
+	}
+}
+

 func TestParseTweetResponse(t *testing.T) {
 	data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")
				`@ -0,0 +1 @@`
				{"created_at":"Sun Sep 26 19:32:46 +0000 2021","id_str":"1442210557046792199","full_text":"I was using an analogy about creating out-groups but the Germans sure love their literalism\n\nhttps://t.co/dCMA90L72V","display_text_range":[0,116],"entities":{"urls":[{"url":"https://t.co/dCMA90L72V","expanded_url":"https://twitter.com/CJHopkins_Z23/status/1442092399358930946","display_url":"twitter.com/CJHopkins_Z23/…","indices":[93,116]}]},"source":"<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>","in_reply_to_status_id_str":"1335678942020300802","in_reply_to_user_id_str":"44067298","in_reply_to_screen_name":"michaelmalice","user_id_str":"44067298","is_quote_status":true,"quoted_status_id_str":"1442092399358930946","quoted_status_permalink":{"url":"https://t.co/dCMA90L72V","expanded":"https://twitter.com/CJHopkins_Z23/status/1442092399358930946","display":"twitter.com/CJHopkins_Z23/…"},"retweet_count":36,"favorite_count":386,"reply_count":12,"quote_count":0,"conversation_id_str":"1335381311255683072","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1335381311255683072"}}