Unescape HTML entities in tweet text

This commit is contained in:
Alessio 2022-01-05 21:04:40 -05:00
parent 8af6b6d3a6
commit bf2dbede94
3 changed files with 23 additions and 0 deletions

View File

@ -2,6 +2,7 @@ package scraper
import (
"fmt"
"html"
"time"
"strings"
"encoding/json"
@ -194,6 +195,7 @@ func (t *APITweet) NormalizeContent() {
}
}
}
t.FullText = html.UnescapeString(t.FullText)
t.FullText = strings.TrimSpace(t.FullText)
}

View File

@ -27,6 +27,7 @@ func TestNormalizeContent(t *testing.T) {
{"test_responses/single_tweets/tweet_with_quoted_tweet_as_link.json", "", 1422680899670274048, 0, 0, ""},
{"test_responses/single_tweets/tweet_with_quoted_tweet_as_link2.json", "sometimes they're too dimwitted to even get the wrong title right", 1396194494710788100, 1395882872729477131, 0, ""},
{"test_responses/single_tweets/tweet_with_quoted_tweet_as_link3.json", "I was using an analogy about creating out-groups but the Germans sure love their literalism", 1442092399358930946, 1335678942020300802, 0, ""},
{"test_responses/single_tweets/tweet_with_html_entities.json", "By the 1970s the elite consensus was that \"the hunt for atomic spies\" had been a grotesque over-reaction to minor leaks that cost the lives of the Rosenbergs & ruined many innocents. Only when the USSR fell was it discovered that they & other spies had given away ALL the secrets", 0, 0, 0, ""},
}
for _, v := range test_cases {

View File

@ -0,0 +1,20 @@
{
"created_at": "Thu Dec 23 20:55:48 +0000 2021",
"id_str": "1474121585510563845",
"full_text": "By the 1970s the elite consensus was that \"the hunt for atomic spies\" had been a grotesque over-reaction to minor leaks that cost the lives of the Rosenbergs &amp; ruined many innocents. Only when the USSR fell was it discovered that they &amp; other spies had given away ALL the secrets",
"display_text_range":
[
0,
288
],
"entities":
{},
"source": "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>",
"user_id_str": "1239676915386068993",
"retweet_count": 239,
"favorite_count": 1118,
"reply_count": 26,
"quote_count": 26,
"conversation_id_str": "1474121585510563845",
"lang": "en"
}