diff --git a/cmd/tests.sh b/cmd/tests.sh
index 4316f5e..8a1b768 100755
--- a/cmd/tests.sh
+++ b/cmd/tests.sh
@@ -114,4 +114,20 @@ tw fetch_user HbdNrx
test $(sqlite3 twitter.db "select is_private from users where handle = 'HbdNrx'") = "1"
+# Test tweets with URLs: one fetch must add exactly one `urls` row. Operands of `test` are quoted so empty or glob-like query output cannot be re-split
+urls_count=$(sqlite3 twitter.db "select count(*) from urls")
+tw fetch_tweet https://twitter.com/CovfefeAnon/status/1428904664645394433
+urls_count_after=$(sqlite3 twitter.db "select count(*) from urls")
+test "$urls_count_after" = "$(($urls_count + 1))"
+test "$(sqlite3 twitter.db "select title from urls where tweet_id = 1428904664645394433")" = "Justice Department investigating Elon Musk's SpaceX following complaint of hiring discrimination"
+test "$(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433")" = "https://pbs.twimg.com/card_img/1436430370946392064/WX1Rv2AJ?format=jpg&name=800x320_1"
+
+# Try to double-fetch it; shouldn't duplicate the URL
+tw fetch_tweet https://twitter.com/CovfefeAnon/status/1428904664645394433
+urls_count_after_2x=$(sqlite3 twitter.db "select count(*) from urls")
+test "$urls_count_after_2x" = "$urls_count_after"
+
+
+# TODO: Maybe this file should be broken up into multiple test scripts
+
echo -e "\033[32mAll tests passed. Finished successfully.\033[0m"
diff --git a/persistence/media_queries.go b/persistence/media_queries.go
index f997406..cfd6c6a 100644
--- a/persistence/media_queries.go
+++ b/persistence/media_queries.go
@@ -42,6 +42,22 @@ func (p Profile) SaveVideo(vid scraper.Video) error {
return err
}
+/**
+ * Save a Url.  On a (tweet_id, text) collision only `is_content_downloaded` is overwritten.
+ * The conflict target is explicit because SQLite before 3.35 rejects a bare `do update`.
+ */
+func (p Profile) SaveUrl(url scraper.Url) error {
+	_, err := p.DB.Exec(`
+		insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded)
+		values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+		    on conflict (tweet_id, text) do update
+		   set is_content_downloaded=excluded.is_content_downloaded
+		`,
+		url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.IsContentDownloaded,
+	)
+	return err
+}
+
/**
* Get the list of images for a tweet
*/
@@ -93,3 +109,28 @@ func (p Profile) GetVideosForTweet(t scraper.Tweet) (vids []scraper.Video, err e
}
return
}
+
+/**
+ * Get the list of Urls for a Tweet
+ */
+func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error) {
+ stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded from urls where tweet_id=? order by rowid")
+ if err != nil {
+ return
+ }
+ defer stmt.Close()
+ rows, err := stmt.Query(t.ID)
+ if err != nil {
+ return
+ }
+ var url scraper.Url
+ for rows.Next() {
+ err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.IsContentDownloaded)
+ if err != nil {
+ return
+ }
+ url.TweetID = t.ID
+ urls = append(urls, url)
+ }
+ return
+}
diff --git a/persistence/media_queries_test.go b/persistence/media_queries_test.go
index 49b6665..440dcdd 100644
--- a/persistence/media_queries_test.go
+++ b/persistence/media_queries_test.go
@@ -165,3 +165,81 @@ func TestModifyVideo(t *testing.T) {
t.Error(diff)
}
}
+
+
+/**
+ * Round-trip test: build a Url, persist it, read it back, and expect an identical value
+ */
+func TestSaveAndLoadUrl(t *testing.T) {
+	profile := create_or_load_profile("test_profiles/TestMediaQueries")
+	tweet := create_stable_tweet()
+
+	// A randomly-numbered Url, attached to the stable tweet
+	rand.Seed(time.Now().UnixNano())
+	url := create_url_from_id(rand.Int())
+	url.TweetID = tweet.ID
+
+	// Persist it
+	if save_err := profile.SaveUrl(url); save_err != nil {
+		t.Fatalf("Failed to save the url: %s", save_err.Error())
+	}
+
+	// Read back every Url belonging to the tweet
+	loaded_urls, load_err := profile.GetUrlsForTweet(tweet)
+	if load_err != nil {
+		t.Fatalf("Could not load urls: %s", load_err.Error())
+	}
+
+	// Pick out the one just saved, matching on Text
+	var new_url scraper.Url
+	for _, candidate := range loaded_urls {
+		if candidate.Text != url.Text {
+			continue
+		}
+		new_url = candidate
+	}
+	if new_url.Text != url.Text {
+		t.Fatalf("Could not find url for some reason: %s, %s; %+v", new_url.Text, url.Text, loaded_urls)
+	}
+
+	if diff := deep.Equal(url, new_url); diff != nil {
+		t.Error(diff)
+	}
+}
+
+/**
+ * Change a Url, save the changes, reload it, and check that it comes back the same
+ */
+func TestModifyUrl(t *testing.T) {
+	profile := create_or_load_profile("test_profiles/TestMediaQueries")
+
+	tweet := create_stable_tweet()
+	url := tweet.Urls[0]
+	if url.Text != "-1text" {
+		t.Fatalf("Got the wrong url back: wanted %s, got %s!", "-1text", url.Text)
+	}
+
+	url.IsContentDownloaded = true
+
+	// Save the changes; stop here on failure, since the reload below would compare stale data
+	if err := profile.SaveUrl(url); err != nil {
+		t.Fatalf("Failed to save the url: %s", err.Error())
+	}
+
+	// Reload it
+	urls, err := profile.GetUrlsForTweet(tweet)
+	if err != nil {
+		t.Fatalf("Could not load urls: %s", err.Error())
+	}
+	if len(urls) == 0 { // guard the urls[0] access: fail cleanly instead of panicking
+		t.Fatalf("No urls were loaded for tweet %v", tweet.ID)
+	}
+	new_url := urls[0]
+	if new_url.Text != "-1text" {
+		t.Fatalf("Got the wrong url back: wanted %s, got %s!", "-1text", new_url.Text)
+	}
+
+	if diff := deep.Equal(url, new_url); diff != nil {
+		t.Error(diff)
+	}
+}
diff --git a/persistence/schema.sql b/persistence/schema.sql
index ffad3ce..5a59e19 100644
--- a/persistence/schema.sql
+++ b/persistence/schema.sql
@@ -54,10 +54,21 @@ create table retweets(rowid integer primary key,
create table urls (rowid integer primary key,
tweet_id integer not null,
+ domain text, -- e.g. "reason.com"
text text not null,
+ title text,
+ description text,
+ creator_id integer, -- a user id; see the disabled foreign key below
+ site_id integer, -- a user id; see the disabled foreign key below
+ thumbnail_remote_url text, -- e.g. a https://pbs.twimg.com/card_img/... address
+ thumbnail_local_path text,
+ has_card boolean, -- false for a link that was scraped without a card
+ is_content_downloaded boolean default 0, -- the one column SaveUrl overwrites on a repeat save
unique (tweet_id, text)
foreign key(tweet_id) references tweets(id)
+ -- foreign key(creator_id) references users(id)
+ -- foreign key(site_id) references users(id)
);
create table images (rowid integer primary key,
diff --git a/persistence/tweet_queries.go b/persistence/tweet_queries.go
index 82f52c2..a050fb7 100644
--- a/persistence/tweet_queries.go
+++ b/persistence/tweet_queries.go
@@ -1,7 +1,6 @@
package persistence
import (
- "fmt"
"time"
"strings"
"database/sql"
@@ -34,7 +33,7 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
return err
}
for _, url := range t.Urls {
- _, err := db.Exec("insert into urls (tweet_id, text) values (?, ?) on conflict do nothing", t.ID, url)
+ err := p.SaveUrl(url)
if err != nil {
return err
}
@@ -80,29 +79,6 @@ func (p Profile) IsTweetInDatabase(id scraper.TweetID) bool {
return true
}
-func (p Profile) attach_urls(t *scraper.Tweet) error {
- println("Attaching urls")
- stmt, err := p.DB.Prepare("select text from urls where tweet_id = ?")
- if err != nil {
- return err
- }
- defer stmt.Close()
- rows, err := stmt.Query(t.ID)
- if err != nil {
- return err
- }
- var url string
- for rows.Next() {
- err = rows.Scan(&url)
- if err != nil {
- return err
- }
- t.Urls = append(t.Urls, url)
- fmt.Printf("%v\n", t.Urls)
- }
- return nil
-}
-
func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
db := p.DB
@@ -146,7 +122,9 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
}
t.Videos = vids
- err = p.attach_urls(&t)
+ urls, err := p.GetUrlsForTweet(t)
+ t.Urls = urls
+
return t, err
}
diff --git a/persistence/tweet_queries_test.go b/persistence/tweet_queries_test.go
index 5dc6e81..eb44df5 100644
--- a/persistence/tweet_queries_test.go
+++ b/persistence/tweet_queries_test.go
@@ -29,12 +29,6 @@ func TestSaveAndLoadTweet(t *testing.T) {
t.Fatalf("Failed to load the tweet: %s", err.Error())
}
- if diff := deep.Equal(tweet.Images, new_tweet.Images); diff != nil {
- t.Error(diff)
- }
- if diff := deep.Equal(tweet.Videos, new_tweet.Videos); diff != nil {
- t.Error(diff)
- }
if diff := deep.Equal(tweet, new_tweet); diff != nil {
t.Error(diff)
}
diff --git a/persistence/utils_test.go b/persistence/utils_test.go
index 3bb24af..55b9133 100644
--- a/persistence/utils_test.go
+++ b/persistence/utils_test.go
@@ -92,6 +92,26 @@ func create_video_from_id(id int) scraper.Video {
}
}
+/**
+ * Create a semi-stable Url based on the given ID
+ */
+func create_url_from_id(id int) scraper.Url {
+ s := fmt.Sprint(id)
+ return scraper.Url {
+ TweetID: -1,
+ Domain: s + "domain",
+ Text: s + "text",
+ Title: s + "title",
+ Description: s + "description",
+ ThumbnailRemoteUrl: s + "remote url",
+ ThumbnailLocalPath: s + "local path",
+ CreatorID: scraper.UserID(id),
+ SiteID: scraper.UserID(id),
+ HasCard: true,
+ IsContentDownloaded: false,
+ }
+}
+
/**
* Create a stable tweet with a fixed ID and content
*/
@@ -109,7 +129,9 @@ func create_stable_tweet() scraper.Tweet {
Videos: []scraper.Video{
create_video_from_id(-1),
},
- Urls: []string{},
+ Urls: []scraper.Url{
+ create_url_from_id(-1),
+ },
Images: []scraper.Image{
create_image_from_id(-1),
},
@@ -173,6 +195,11 @@ func create_dummy_tweet() scraper.Tweet {
vid := create_video_from_id(rand.Int())
vid.TweetID = tweet_id
+ url1 := create_url_from_id(rand.Int())
+ url1.TweetID = tweet_id
+ url2 := create_url_from_id(rand.Int())
+ url2.TweetID = tweet_id
+
return scraper.Tweet{
ID: tweet_id,
UserID: -1,
@@ -183,7 +210,7 @@ func create_dummy_tweet() scraper.Tweet {
NumReplies: 3,
NumQuoteTweets: 4,
Videos: []scraper.Video{vid},
- Urls: []string{"url1", "url2"},
+ Urls: []scraper.Url{url1, url2},
Images: []scraper.Image{img1, img2},
Mentions: []scraper.UserHandle{"mention1", "mention2"},
Hashtags: []string{"hash1", "hash2"},
diff --git a/scraper/test_responses/tweet_with_url_but_no_card.json b/scraper/test_responses/tweet_with_url_but_no_card.json
new file mode 100644
index 0000000..e520d8b
--- /dev/null
+++ b/scraper/test_responses/tweet_with_url_but_no_card.json
@@ -0,0 +1 @@
+{"created_at":"Fri Sep 17 00:03:26 +0000 2021","id_str":"1438654793384353793","full_text":"NEW: columnist Jennifer Rubin was one of the Obama administration’s most reactionary critics\n\nNow she’s the Biden admin’s favorite columnist\nW/ @NickNiedz\n\nWe reached out to Rubin her columns and divisions at Wapo over them\n\nHer response—>\nhttps://t.co/ZigZyLctwt https://t.co/KZZAK1tXhq","display_text_range":[0,266],"entities":{"user_mentions":[{"screen_name":"NickNiedz","name":"Nick Niedzwiadek","id_str":"548501303","indices":[144,154]}],"urls":[{"url":"https://t.co/ZigZyLctwt","expanded_url":"https://www.politico.com/newsletters/west-wing-playbook/2021/09/16/the-jennifer-rubin-wh-symbiosis-494364","display_url":"politico.com/newsletters/we…","indices":[243,266]}],"media":[{"id_str":"1438654789596942336","indices":[267,290],"media_url":"http://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","media_url_https":"https://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","url":"https://t.co/KZZAK1tXhq","display_url":"pic.twitter.com/KZZAK1tXhq","expanded_url":"https://twitter.com/AlexThomp/status/1438654793384353793/photo/1","type":"photo","original_info":{"width":1170,"height":1809,"focus_rects":[{"x":0,"y":0,"h":655,"w":1170},{"x":0,"y":0,"h":1170,"w":1170},{"x":0,"y":0,"h":1334,"w":1170},{"x":45,"y":0,"h":1809,"w":905},{"x":0,"y":0,"h":1809,"w":1170}]},"sizes":{"small":{"w":440,"h":680,"resize":"fit"},"medium":{"w":776,"h":1200,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1170,"h":1809,"resize":"fit"}}}]},"extended_entities":{"media":[{"id_str":"1438654789596942336","indices":[267,290],"media_url":"http://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","media_url_https":"https://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","url":"https://t.co/KZZAK1tXhq","display_url":"pic.twitter.com/KZZAK1tXhq","expanded_url":"https://twitter.com/AlexThomp/status/1438654793384353793/photo/1","type":"photo","original_info":{"width":1170,"height":1809,"focus_rects":[{"x":0,"y":0,"h":655
,"w":1170},{"x":0,"y":0,"h":1170,"w":1170},{"x":0,"y":0,"h":1334,"w":1170},{"x":45,"y":0,"h":1809,"w":905},{"x":0,"y":0,"h":1809,"w":1170}]},"sizes":{"small":{"w":440,"h":680,"resize":"fit"},"medium":{"w":776,"h":1200,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1170,"h":1809,"resize":"fit"}},"media_key":"3_1438654789596942336","ext_alt_text":null,"ext_media_availability":{"status":"available"},"ext_media_color":{"palette":[{"rgb":{"red":252,"green":252,"blue":252},"percentage":99.67},{"rgb":{"red":145,"green":145,"blue":145},"percentage":0.33}]},"ext":{"mediaStats":{"r":"Missing","ttl":-1}}}]},"source":"Twitter for iPhone","user_id_str":"370982639","retweet_count":407,"favorite_count":1802,"reply_count":2642,"quote_count":884,"conversation_id_str":"1438654793384353793","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1438654793384353793"}}
diff --git a/scraper/test_responses/tweet_with_url_card.json b/scraper/test_responses/tweet_with_url_card.json
new file mode 100644
index 0000000..addb4f7
--- /dev/null
+++ b/scraper/test_responses/tweet_with_url_card.json
@@ -0,0 +1 @@
+{"created_at":"Mon Aug 30 22:48:51 +0000 2021","id_str":"1432475431525969920","full_text":"\"It's OK that our babies may not have learned all their times tables. They know the difference between a riot and a protest. They know the words insurrection and coup.\" - LA Teacher’s Union \n\nhttps://t.co/Y1lWjNEiPK","display_text_range":[0,215],"entities":{"urls":[{"url":"https://t.co/Y1lWjNEiPK","expanded_url":"https://reason.com/2021/08/30/la-teachers-union-cecily-myart-cruz-learning-loss/","display_url":"reason.com/2021/08/30/la-…","indices":[192,215]}]},"source":"Twitter for iPhone","user_id_str":"358545917","retweet_count":146,"favorite_count":557,"reply_count":64,"quote_count":25,"conversation_id_str":"1432475431525969920","possibly_sensitive_editable":true,"card":{"name":"summary_large_image","url":"https://t.co/Y1lWjNEiPK","card_type_url":"http://card-type-url-is-deprecated.invalid","binding_values":{"vanity_url":{"type":"STRING","string_value":"reason.com","scribe_key":"vanity_url"},"amp":{"type":"BOOLEAN","boolean_value":true},"domain":{"type":"STRING","string_value":"reason.com"},"creator":{"type":"USER","user_value":{"id_str":"155581583","path":[]}},"site":{"type":"USER","user_value":{"id_str":"16467567","path":[]},"scribe_key":"publisher_id"},"title":{"type":"STRING","string_value":"L.A. Teachers Union Leader: 'There's No Such Thing As Learning Loss'"},"description":{"type":"STRING","string_value":"\"It’s OK that our babies may not have learned all their times tables,\" says Cecily Myart-Cruz. 
\"They learned resilience.\""},"thumbnail_image_small":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=144x144","width":144,"height":76}},"thumbnail_image":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=280x150","width":280,"height":147}},"thumbnail_image_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x600","width":600,"height":315}},"thumbnail_image_x_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=png&name=2048x2048_2_exp","width":1200,"height":630}},"thumbnail_image_original":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=orig","width":1200,"height":630}},"summary_photo_image_small":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=386x202","width":386,"height":202}},"summary_photo_image":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x314","width":600,"height":314}},"summary_photo_image_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=800x419","width":800,"height":419}},"summary_photo_image_x_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=png&name=2048x2048_2_exp","width":1200,"height":630}},"summary_photo_image_original":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=orig","width":1200,"height":630}},"photo_image_full_size_small":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=386x202","width":386,"height":202}},"photo_image_full_
size":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x314","width":600,"height":314}},"photo_image_full_size_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=800x419","width":800,"height":419}},"photo_image_full_size_x_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=png&name=2048x2048_2_exp","width":1200,"height":630}},"photo_image_full_size_original":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=orig","width":1200,"height":630}},"card_url":{"type":"STRING","string_value":"https://t.co/Y1lWjNEiPK","scribe_key":"card_url"}},"users":{"155581583":{"id_str":"155581583","name":"Robby Soave","screen_name":"robbysoave","location":"","description":"@reason senior editor and \"noted scooter anthropologist.\" Author of \"Panic Attack\" and \"Tech Panic.\" Pre-order now: https://t.co/Mb5iBIZhcP","entities":{"description":{"urls":[{"url":"https://t.co/Mb5iBIZhcP","expanded_url":"http://tinyurl.com/rt2zftny","display_url":"tinyurl.com/rt2zftny","indices":[116,139]}]}},"followers_count":81373,"fast_followers_count":0,"normal_followers_count":81373,"friends_count":1807,"listed_count":979,"created_at":"Mon Jun 14 14:45:39 +0000 
2010","favourites_count":7920,"geo_enabled":true,"verified":true,"statuses_count":12184,"media_count":1139,"profile_image_url_https":"https://pbs.twimg.com/profile_images/1423673056371871744/NKzapFP-_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/155581583/1628264486","profile_image_extensions_alt_text":null,"profile_image_extensions_media_availability":null,"profile_image_extensions_media_color":{"palette":[{"rgb":{"red":188,"green":188,"blue":187},"percentage":54.64},{"rgb":{"red":40,"green":38,"blue":34},"percentage":19.13},{"rgb":{"red":210,"green":152,"blue":114},"percentage":15.07},{"rgb":{"red":105,"green":68,"blue":43},"percentage":8.59},{"rgb":{"red":182,"green":104,"blue":84},"percentage":0.36}]},"profile_image_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_banner_extensions_alt_text":null,"profile_banner_extensions_media_availability":null,"profile_banner_extensions_media_color":{"palette":[{"rgb":{"red":155,"green":28,"blue":40},"percentage":80.27},{"rgb":{"red":188,"green":133,"blue":139},"percentage":9.65},{"rgb":{"red":236,"green":44,"blue":58},"percentage":8.05},{"rgb":{"red":208,"green":175,"blue":178},"percentage":1.14},{"rgb":{"red":31,"green":27,"blue":27},"percentage":0.4}]},"profile_banner_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_link_color":"0084B4","has_extended_profile":true,"pinned_tweet_ids":[1435296810638233602],"pinned_tweet_ids_str":["1435296810638233602"],"has_custom_timelines":true,"advertiser_account_type":"promotable_user","advertiser_account_service_levels":[],"profile_interstitial_type":"","business_profile_state":"none","translator_type":"none","withheld_in_countries":[],"ext":{"highlightedLabel":{"r":{"ok":{}},"ttl":-1}}},"16467567":{"id_str":"16467567","name":"reason","screen_name":"reason","location":"Washington, DC and Los Angeles","description":"Reason is the monthly magazine and website of “free minds and free markets” published by 
@ReasonFdn.","url":"https://t.co/W2vfk5WIWy","entities":{"url":{"urls":[{"url":"https://t.co/W2vfk5WIWy","expanded_url":"http://reason.com","display_url":"reason.com","indices":[0,23]}]},"description":{}},"followers_count":265717,"fast_followers_count":0,"normal_followers_count":265717,"friends_count":365,"listed_count":6456,"created_at":"Fri Sep 26 13:31:17 +0000 2008","favourites_count":208,"verified":true,"statuses_count":91534,"media_count":2538,"profile_image_url_https":"https://pbs.twimg.com/profile_images/943872166101000192/JIdyYi7P_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/16467567/1629580544","profile_image_extensions_alt_text":null,"profile_image_extensions_media_availability":null,"profile_image_extensions_media_color":{"palette":[{"rgb":{"red":242,"green":107,"blue":52},"percentage":92.62},{"rgb":{"red":255,"green":255,"blue":255},"percentage":7.15},{"rgb":{"red":249,"green":193,"blue":168},"percentage":0.15}]},"profile_image_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_banner_extensions_alt_text":null,"profile_banner_extensions_media_availability":null,"profile_banner_extensions_media_color":{"palette":[{"rgb":{"red":229,"green":179,"blue":1},"percentage":46.93},{"rgb":{"red":11,"green":9,"blue":4},"percentage":19.06},{"rgb":{"red":241,"green":222,"blue":87},"percentage":8.48},{"rgb":{"red":144,"green":115,"blue":28},"percentage":8.16},{"rgb":{"red":238,"green":151,"blue":3},"percentage":5.56}]},"profile_banner_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_link_color":"FF6C2F","pinned_tweet_ids":[],"pinned_tweet_ids_str":[],"advertiser_account_type":"promotable_user","advertiser_account_service_levels":["smb","media_studio"],"profile_interstitial_type":"","business_profile_state":"none","translator_type":"none","withheld_in_countries":[],"ext":{"highlightedLabel":{"r":{"ok":{}},"ttl":-1}}}},"card_platform":{"platform":{"device":{"name":"Swift","version":"12"},"audience":{"name":"
production"}}}},"lang":"en"}
diff --git a/scraper/tweet.go b/scraper/tweet.go
index 2a0ec04..20299fd 100644
--- a/scraper/tweet.go
+++ b/scraper/tweet.go
@@ -23,7 +23,7 @@ type Tweet struct {
NumQuoteTweets int
InReplyTo TweetID
- Urls []string
+ Urls []Url
Images []Image
Videos []Video
Mentions []UserHandle
@@ -63,7 +63,7 @@ Replies: %d RT: %d QT: %d Likes: %d
if len(t.Urls) > 0 {
ret += "urls: [\n"
for _, url := range(t.Urls) {
- ret += " " + url + "\n"
+ ret += " " + url.Text + "\n"
}
ret += "]"
}
@@ -89,8 +89,18 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
ret.NumQuoteTweets = apiTweet.QuoteCount
ret.InReplyTo = TweetID(apiTweet.InReplyToStatusID)
- for _, url := range apiTweet.Entities.URLs {
- ret.Urls = append(ret.Urls, url.ExpandedURL)
+ for i, url := range apiTweet.Entities.URLs {
+ if i != 0 {
+ panic(fmt.Sprintf("Tweet with multiple embedded URLs: %d", apiTweet.ID))
+ }
+ var url_object Url
+ if apiTweet.Card.BindingValues.Domain.Value != "" {
+ // Using the "Domain" field to detect if there is a card
+ url_object = ParseAPIUrlCard(apiTweet.Card)
+ }
+ url_object.Text = url.ExpandedURL
+ url_object.TweetID = ret.ID
+ ret.Urls = append(ret.Urls, url_object)
}
for _, media := range apiTweet.Entities.Media {
if media.Type != "photo" { // TODO: remove this eventually
diff --git a/scraper/tweet_test.go b/scraper/tweet_test.go
index 834d8fc..4120ae5 100644
--- a/scraper/tweet_test.go
+++ b/scraper/tweet_test.go
@@ -52,6 +52,10 @@ func TestParseSingleTweet(t *testing.T) {
t.Errorf("Expected %v, got %v", []string{"michaelmalice"}, tweet.Mentions)
}
+ if len(tweet.Urls) != 0 {
+ t.Errorf("Expected %d urls, but got %d", 0, len(tweet.Urls))
+ }
+
if tweet.PostedAt.Unix() != 1621639105 {
t.Errorf("Expected %d, got %d", 1621639105, tweet.PostedAt.Unix())
}
@@ -162,6 +166,66 @@ func TestParseTweetWithVideo(t *testing.T) {
}
}
+// A tweet whose link carries a card should parse to exactly one Url, with the card's data on it.
+func TestParseTweetWithUrl(t *testing.T) {
+	data, err := ioutil.ReadFile("test_responses/tweet_with_url_card.json")
+	if err != nil {
+		panic(err)
+	}
+	var apitweet scraper.APITweet
+	if err = json.Unmarshal(data, &apitweet); err != nil {
+		t.Fatal(err)
+	}
+	tweet, err := scraper.ParseSingleTweet(apitweet)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Fatalf rather than Errorf: everything below indexes tweet.Urls[0]
+	if len(tweet.Urls) != 1 {
+		t.Fatalf("Expected %d urls, but got %d", 1, len(tweet.Urls))
+	}
+	expected_url_text := "https://reason.com/2021/08/30/la-teachers-union-cecily-myart-cruz-learning-loss/"
+	if tweet.Urls[0].Text != expected_url_text {
+		t.Errorf("Expected Url text to be %q, but got %q", expected_url_text, tweet.Urls[0].Text)
+	}
+	if !tweet.Urls[0].HasCard {
+		t.Errorf("Expected it to have a card, but it doesn't")
+	}
+	expected_url_domain := "reason.com"
+	if tweet.Urls[0].Domain != expected_url_domain {
+		t.Errorf("Expected Url domain to be %q, but got %q", expected_url_domain, tweet.Urls[0].Domain)
+	}
+}
+
+// A tweet with a link but no card should still parse to exactly one Url, with HasCard false.
+func TestParseTweetWithUrlButNoCard(t *testing.T) {
+	data, err := ioutil.ReadFile("test_responses/tweet_with_url_but_no_card.json")
+	if err != nil {
+		panic(err)
+	}
+	var apitweet scraper.APITweet
+	if err = json.Unmarshal(data, &apitweet); err != nil {
+		t.Fatal(err)
+	}
+	tweet, err := scraper.ParseSingleTweet(apitweet)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Fatalf rather than Errorf: everything below indexes tweet.Urls[0]
+	if len(tweet.Urls) != 1 {
+		t.Fatalf("Expected %d urls, but got %d", 1, len(tweet.Urls))
+	}
+	expected_url_text := "https://www.politico.com/newsletters/west-wing-playbook/2021/09/16/the-jennifer-rubin-wh-symbiosis-494364"
+	if tweet.Urls[0].Text != expected_url_text {
+		t.Errorf("Expected Url text to be %q, but got %q", expected_url_text, tweet.Urls[0].Text)
+	}
+	if tweet.Urls[0].HasCard {
+		t.Errorf("Expected url not to have a card, but it thinks it has one")
+	}
+}
+
func TestParseTweetResponse(t *testing.T) {
data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")
if err != nil {
diff --git a/scraper/url.go b/scraper/url.go
index ffe1b3a..3c5f9e6 100644
--- a/scraper/url.go
+++ b/scraper/url.go
@@ -7,6 +7,8 @@ import (
)
type Url struct {
+ TweetID TweetID
+
Domain string
Text string
Title string