diff --git a/cmd/tests.sh b/cmd/tests.sh index 4316f5e..8a1b768 100755 --- a/cmd/tests.sh +++ b/cmd/tests.sh @@ -114,4 +114,20 @@ tw fetch_user HbdNrx test $(sqlite3 twitter.db "select is_private from users where handle = 'HbdNrx'") = "1" +# Test tweets with URLs +urls_count=$(sqlite3 twitter.db "select count(*) from urls") +tw fetch_tweet https://twitter.com/CovfefeAnon/status/1428904664645394433 +urls_count_after=$(sqlite3 twitter.db "select count(*) from urls") +test $urls_count_after = $(($urls_count + 1)) +test "$(sqlite3 twitter.db "select title from urls where tweet_id = 1428904664645394433")" = "Justice Department investigating Elon Musk's SpaceX following complaint of hiring discrimination" +test $(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433") = "https://pbs.twimg.com/card_img/1436430370946392064/WX1Rv2AJ?format=jpg&name=800x320_1" + +# Try to double-fetch it; shouldn't duplicate the URL +tw fetch_tweet https://twitter.com/CovfefeAnon/status/1428904664645394433 +urls_count_after_2x=$(sqlite3 twitter.db "select count(*) from urls") +test $urls_count_after_2x = $urls_count_after + + +# TODO: Maybe this file should be broken up into multiple test scripts + echo -e "\033[32mAll tests passed. Finished successfully.\033[0m" diff --git a/persistence/media_queries.go b/persistence/media_queries.go index f997406..cfd6c6a 100644 --- a/persistence/media_queries.go +++ b/persistence/media_queries.go @@ -42,6 +42,22 @@ func (p Profile) SaveVideo(vid scraper.Video) error { return err } +/** + * Save an Url + */ +func (p Profile) SaveUrl(url scraper.Url) error { + _, err := p.DB.Exec(` + insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict do update + set is_content_downloaded=? + `, + url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.IsContentDownloaded, + url.IsContentDownloaded, + ) + return err +} + /** * Get the list of images for a tweet */ @@ -93,3 +109,28 @@ func (p Profile) GetVideosForTweet(t scraper.Tweet) (vids []scraper.Video, err e } return } + +/** + * Get the list of Urls for a Tweet + */ +func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error) { + stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded from urls where tweet_id=? order by rowid") + if err != nil { + return + } + defer stmt.Close() + rows, err := stmt.Query(t.ID) + if err != nil { + return + } + var url scraper.Url + for rows.Next() { + err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.IsContentDownloaded) + if err != nil { + return + } + url.TweetID = t.ID + urls = append(urls, url) + } + return +} diff --git a/persistence/media_queries_test.go b/persistence/media_queries_test.go index 49b6665..440dcdd 100644 --- a/persistence/media_queries_test.go +++ b/persistence/media_queries_test.go @@ -165,3 +165,81 @@ func TestModifyVideo(t *testing.T) { t.Error(diff) } } + + +/** + * Create an Url, save it, reload it, and make sure it comes back the same + */ +func TestSaveAndLoadUrl(t *testing.T) { + profile_path := "test_profiles/TestMediaQueries" + profile := create_or_load_profile(profile_path) + + tweet := create_stable_tweet() + + // Create a fresh Url to test on + rand.Seed(time.Now().UnixNano()) + url := create_url_from_id(rand.Int()) + url.TweetID = tweet.ID + + // Save the Url + err := profile.SaveUrl(url) + if err != nil { + t.Fatalf("Failed to save the url: %s", err.Error()) + } + + // Reload the Url + urls, err := profile.GetUrlsForTweet(tweet) + if err != nil { + t.Fatalf("Could not load urls: %s", err.Error()) + } + + var new_url scraper.Url + for index := range urls { + if urls[index].Text == url.Text { + new_url = urls[index] + } + } + if new_url.Text != url.Text { + t.Fatalf("Could not find url for some reason: %s, %s; %+v", new_url.Text, url.Text, urls) + } + if diff := deep.Equal(url, new_url); diff != nil { + t.Error(diff) + } +} + +/** + * Change an Url, save the changes, reload it, and check if it comes back the same + */ +func TestModifyUrl(t *testing.T) { + profile_path := "test_profiles/TestMediaQueries" + profile := create_or_load_profile(profile_path) + + tweet := create_stable_tweet() + url := tweet.Urls[0] + + if url.Text != "-1text" { + t.Fatalf("Got the wrong url back: wanted %s, got %s!", "-1text", url.Text) + } + + url.IsContentDownloaded = true + + // Save the changes + err := profile.SaveUrl(url) + if err != nil { + t.Error(err) + } + + // Reload it + urls, err := profile.GetUrlsForTweet(tweet) + if err != nil { + t.Fatalf("Could not load urls: %s", err.Error()) + } + new_url := urls[0] + if new_url.Text != "-1text" { + t.Fatalf("Got the wrong url back: wanted %s, got %s!", "-1text", new_url.Text) + } + + if diff := deep.Equal(url, new_url); diff != nil { + t.Error(diff) + } +} diff --git a/persistence/schema.sql b/persistence/schema.sql index ffad3ce..5a59e19 100644 --- a/persistence/schema.sql +++ b/persistence/schema.sql @@ -54,10 +54,21 @@ create table retweets(rowid integer primary key, create table urls (rowid integer primary key, tweet_id integer not null, + domain text, text text not null, + title text, + description text, + creator_id integer, + site_id integer, + thumbnail_remote_url text, + thumbnail_local_path text, + has_card boolean, + is_content_downloaded boolean default 0, unique (tweet_id, text) foreign key(tweet_id) references tweets(id) + -- foreign key(creator_id) references users(id) + -- foreign key(site_id) references users(id) ); create table images (rowid integer primary key, diff --git a/persistence/tweet_queries.go b/persistence/tweet_queries.go index 82f52c2..a050fb7 100644 --- a/persistence/tweet_queries.go +++ b/persistence/tweet_queries.go @@ -1,7 +1,6 @@ package persistence import ( - "fmt" "time" "strings" "database/sql" @@ -34,7 +33,7 @@ func (p Profile) SaveTweet(t scraper.Tweet) error { return err } for _, url := range t.Urls { - _, err := db.Exec("insert into urls (tweet_id, text) values (?, ?) on conflict do nothing", t.ID, url) + err := p.SaveUrl(url) if err != nil { return err } @@ -80,29 +79,6 @@ func (p Profile) IsTweetInDatabase(id scraper.TweetID) bool { return true } -func (p Profile) attach_urls(t *scraper.Tweet) error { - println("Attaching urls") - stmt, err := p.DB.Prepare("select text from urls where tweet_id = ?") - if err != nil { - return err - } - defer stmt.Close() - rows, err := stmt.Query(t.ID) - if err != nil { - return err - } - var url string - for rows.Next() { - err = rows.Scan(&url) - if err != nil { - return err - } - t.Urls = append(t.Urls, url) - fmt.Printf("%v\n", t.Urls) - } - return nil -} - func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) { db := p.DB @@ -146,7 +122,9 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) { } t.Videos = vids - err = p.attach_urls(&t) + urls, err := p.GetUrlsForTweet(t) + t.Urls = urls + return t, err } diff --git a/persistence/tweet_queries_test.go b/persistence/tweet_queries_test.go index 5dc6e81..eb44df5 100644 --- a/persistence/tweet_queries_test.go +++ b/persistence/tweet_queries_test.go @@ -29,12 +29,6 @@ func TestSaveAndLoadTweet(t *testing.T) { t.Fatalf("Failed to load the tweet: %s", err.Error()) } - if diff := deep.Equal(tweet.Images, new_tweet.Images); diff != nil { - t.Error(diff) - } - if diff := deep.Equal(tweet.Videos, new_tweet.Videos); diff != nil { - t.Error(diff) - } if diff := deep.Equal(tweet, new_tweet); diff != nil { t.Error(diff) } diff --git a/persistence/utils_test.go b/persistence/utils_test.go index 3bb24af..55b9133 100644 --- a/persistence/utils_test.go +++ b/persistence/utils_test.go @@ -92,6 +92,26 @@ func create_video_from_id(id int) scraper.Video { } } +/** + * Create a semi-stable Url based on the given ID + */ +func create_url_from_id(id int) scraper.Url { + s := fmt.Sprint(id) + return scraper.Url { + TweetID: -1, + Domain: s + "domain", + Text: s + "text", + Title: s + "title", + Description: s + "description", + ThumbnailRemoteUrl: s + "remote url", + ThumbnailLocalPath: s + "local path", + CreatorID: scraper.UserID(id), + SiteID: scraper.UserID(id), + HasCard: true, + IsContentDownloaded: false, + } +} + /** * Create a stable tweet with a fixed ID and content */ @@ -109,7 +129,9 @@ func create_stable_tweet() scraper.Tweet { Videos: []scraper.Video{ create_video_from_id(-1), }, - Urls: []string{}, + Urls: []scraper.Url{ + create_url_from_id(-1), + }, Images: []scraper.Image{ create_image_from_id(-1), }, @@ -173,6 +195,11 @@ func create_dummy_tweet() scraper.Tweet { vid := create_video_from_id(rand.Int()) vid.TweetID = tweet_id + url1 := create_url_from_id(rand.Int()) + url1.TweetID = tweet_id + url2 := create_url_from_id(rand.Int()) + url2.TweetID = tweet_id + return scraper.Tweet{ ID: tweet_id, UserID: -1, @@ -183,7 +210,7 @@ func create_dummy_tweet() scraper.Tweet { NumReplies: 3, NumQuoteTweets: 4, Videos: []scraper.Video{vid}, - Urls: []string{"url1", "url2"}, + Urls: []scraper.Url{url1, url2}, Images: []scraper.Image{img1, img2}, Mentions: []scraper.UserHandle{"mention1", "mention2"}, Hashtags: []string{"hash1", "hash2"}, diff --git a/scraper/test_responses/tweet_with_url_but_no_card.json b/scraper/test_responses/tweet_with_url_but_no_card.json new file mode 100644 index 0000000..e520d8b --- /dev/null +++ b/scraper/test_responses/tweet_with_url_but_no_card.json @@ -0,0 +1 @@ +{"created_at":"Fri Sep 17 00:03:26 +0000 2021","id_str":"1438654793384353793","full_text":"NEW: columnist Jennifer Rubin was one of the Obama administration’s most reactionary critics\n\nNow she’s the Biden admin’s favorite columnist\nW/ @NickNiedz\n\nWe reached out to Rubin her columns and divisions at Wapo over them\n\nHer response—>\nhttps://t.co/ZigZyLctwt https://t.co/KZZAK1tXhq","display_text_range":[0,266],"entities":{"user_mentions":[{"screen_name":"NickNiedz","name":"Nick Niedzwiadek","id_str":"548501303","indices":[144,154]}],"urls":[{"url":"https://t.co/ZigZyLctwt","expanded_url":"https://www.politico.com/newsletters/west-wing-playbook/2021/09/16/the-jennifer-rubin-wh-symbiosis-494364","display_url":"politico.com/newsletters/we…","indices":[243,266]}],"media":[{"id_str":"1438654789596942336","indices":[267,290],"media_url":"http://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","media_url_https":"https://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","url":"https://t.co/KZZAK1tXhq","display_url":"pic.twitter.com/KZZAK1tXhq","expanded_url":"https://twitter.com/AlexThomp/status/1438654793384353793/photo/1","type":"photo","original_info":{"width":1170,"height":1809,"focus_rects":[{"x":0,"y":0,"h":655,"w":1170},{"x":0,"y":0,"h":1170,"w":1170},{"x":0,"y":0,"h":1334,"w":1170},{"x":45,"y":0,"h":1809,"w":905},{"x":0,"y":0,"h":1809,"w":1170}]},"sizes":{"small":{"w":440,"h":680,"resize":"fit"},"medium":{"w":776,"h":1200,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1170,"h":1809,"resize":"fit"}}}]},"extended_entities":{"media":[{"id_str":"1438654789596942336","indices":[267,290],"media_url":"http://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","media_url_https":"https://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","url":"https://t.co/KZZAK1tXhq","display_url":"pic.twitter.com/KZZAK1tXhq","expanded_url":"https://twitter.com/AlexThomp/status/1438654793384353793/photo/1","type":"photo","original_info":{"width":1170,"height":1809,"focus_rects":[{"x":0,"y":0,"h":655,"w":1170},{"x":0,"y":0,"h":1170,"w":1170},{"x":0,"y":0,"h":1334,"w":1170},{"x":45,"y":0,"h":1809,"w":905},{"x":0,"y":0,"h":1809,"w":1170}]},"sizes":{"small":{"w":440,"h":680,"resize":"fit"},"medium":{"w":776,"h":1200,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1170,"h":1809,"resize":"fit"}},"media_key":"3_1438654789596942336","ext_alt_text":null,"ext_media_availability":{"status":"available"},"ext_media_color":{"palette":[{"rgb":{"red":252,"green":252,"blue":252},"percentage":99.67},{"rgb":{"red":145,"green":145,"blue":145},"percentage":0.33}]},"ext":{"mediaStats":{"r":"Missing","ttl":-1}}}]},"source":"Twitter for iPhone","user_id_str":"370982639","retweet_count":407,"favorite_count":1802,"reply_count":2642,"quote_count":884,"conversation_id_str":"1438654793384353793","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1438654793384353793"}} diff --git a/scraper/test_responses/tweet_with_url_card.json b/scraper/test_responses/tweet_with_url_card.json new file mode 100644 index 0000000..addb4f7 --- /dev/null +++ b/scraper/test_responses/tweet_with_url_card.json @@ -0,0 +1 @@ +{"created_at":"Mon Aug 30 22:48:51 +0000 2021","id_str":"1432475431525969920","full_text":"\"It's OK that our babies may not have learned all their times tables. They know the difference between a riot and a protest. They know the words insurrection and coup.\" - LA Teacher’s Union \n\nhttps://t.co/Y1lWjNEiPK","display_text_range":[0,215],"entities":{"urls":[{"url":"https://t.co/Y1lWjNEiPK","expanded_url":"https://reason.com/2021/08/30/la-teachers-union-cecily-myart-cruz-learning-loss/","display_url":"reason.com/2021/08/30/la-…","indices":[192,215]}]},"source":"Twitter for iPhone","user_id_str":"358545917","retweet_count":146,"favorite_count":557,"reply_count":64,"quote_count":25,"conversation_id_str":"1432475431525969920","possibly_sensitive_editable":true,"card":{"name":"summary_large_image","url":"https://t.co/Y1lWjNEiPK","card_type_url":"http://card-type-url-is-deprecated.invalid","binding_values":{"vanity_url":{"type":"STRING","string_value":"reason.com","scribe_key":"vanity_url"},"amp":{"type":"BOOLEAN","boolean_value":true},"domain":{"type":"STRING","string_value":"reason.com"},"creator":{"type":"USER","user_value":{"id_str":"155581583","path":[]}},"site":{"type":"USER","user_value":{"id_str":"16467567","path":[]},"scribe_key":"publisher_id"},"title":{"type":"STRING","string_value":"L.A. Teachers Union Leader: 'There's No Such Thing As Learning Loss'"},"description":{"type":"STRING","string_value":"\"It’s OK that our babies may not have learned all their times tables,\" says Cecily Myart-Cruz. \"They learned resilience.\""},"thumbnail_image_small":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=144x144","width":144,"height":76}},"thumbnail_image":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=280x150","width":280,"height":147}},"thumbnail_image_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x600","width":600,"height":315}},"thumbnail_image_x_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=png&name=2048x2048_2_exp","width":1200,"height":630}},"thumbnail_image_original":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=orig","width":1200,"height":630}},"summary_photo_image_small":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=386x202","width":386,"height":202}},"summary_photo_image":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x314","width":600,"height":314}},"summary_photo_image_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=800x419","width":800,"height":419}},"summary_photo_image_x_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=png&name=2048x2048_2_exp","width":1200,"height":630}},"summary_photo_image_original":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=orig","width":1200,"height":630}},"photo_image_full_size_small":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=386x202","width":386,"height":202}},"photo_image_full_size":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x314","width":600,"height":314}},"photo_image_full_size_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=800x419","width":800,"height":419}},"photo_image_full_size_x_large":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=png&name=2048x2048_2_exp","width":1200,"height":630}},"photo_image_full_size_original":{"type":"IMAGE","image_value":{"url":"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=orig","width":1200,"height":630}},"card_url":{"type":"STRING","string_value":"https://t.co/Y1lWjNEiPK","scribe_key":"card_url"}},"users":{"155581583":{"id_str":"155581583","name":"Robby Soave","screen_name":"robbysoave","location":"","description":"@reason senior editor and \"noted scooter anthropologist.\" Author of \"Panic Attack\" and \"Tech Panic.\" Pre-order now: https://t.co/Mb5iBIZhcP","entities":{"description":{"urls":[{"url":"https://t.co/Mb5iBIZhcP","expanded_url":"http://tinyurl.com/rt2zftny","display_url":"tinyurl.com/rt2zftny","indices":[116,139]}]}},"followers_count":81373,"fast_followers_count":0,"normal_followers_count":81373,"friends_count":1807,"listed_count":979,"created_at":"Mon Jun 14 14:45:39 +0000 2010","favourites_count":7920,"geo_enabled":true,"verified":true,"statuses_count":12184,"media_count":1139,"profile_image_url_https":"https://pbs.twimg.com/profile_images/1423673056371871744/NKzapFP-_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/155581583/1628264486","profile_image_extensions_alt_text":null,"profile_image_extensions_media_availability":null,"profile_image_extensions_media_color":{"palette":[{"rgb":{"red":188,"green":188,"blue":187},"percentage":54.64},{"rgb":{"red":40,"green":38,"blue":34},"percentage":19.13},{"rgb":{"red":210,"green":152,"blue":114},"percentage":15.07},{"rgb":{"red":105,"green":68,"blue":43},"percentage":8.59},{"rgb":{"red":182,"green":104,"blue":84},"percentage":0.36}]},"profile_image_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_banner_extensions_alt_text":null,"profile_banner_extensions_media_availability":null,"profile_banner_extensions_media_color":{"palette":[{"rgb":{"red":155,"green":28,"blue":40},"percentage":80.27},{"rgb":{"red":188,"green":133,"blue":139},"percentage":9.65},{"rgb":{"red":236,"green":44,"blue":58},"percentage":8.05},{"rgb":{"red":208,"green":175,"blue":178},"percentage":1.14},{"rgb":{"red":31,"green":27,"blue":27},"percentage":0.4}]},"profile_banner_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_link_color":"0084B4","has_extended_profile":true,"pinned_tweet_ids":[1435296810638233602],"pinned_tweet_ids_str":["1435296810638233602"],"has_custom_timelines":true,"advertiser_account_type":"promotable_user","advertiser_account_service_levels":[],"profile_interstitial_type":"","business_profile_state":"none","translator_type":"none","withheld_in_countries":[],"ext":{"highlightedLabel":{"r":{"ok":{}},"ttl":-1}}},"16467567":{"id_str":"16467567","name":"reason","screen_name":"reason","location":"Washington, DC and Los Angeles","description":"Reason is the monthly magazine and website of “free minds and free markets” published by @ReasonFdn.","url":"https://t.co/W2vfk5WIWy","entities":{"url":{"urls":[{"url":"https://t.co/W2vfk5WIWy","expanded_url":"http://reason.com","display_url":"reason.com","indices":[0,23]}]},"description":{}},"followers_count":265717,"fast_followers_count":0,"normal_followers_count":265717,"friends_count":365,"listed_count":6456,"created_at":"Fri Sep 26 13:31:17 +0000 2008","favourites_count":208,"verified":true,"statuses_count":91534,"media_count":2538,"profile_image_url_https":"https://pbs.twimg.com/profile_images/943872166101000192/JIdyYi7P_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/16467567/1629580544","profile_image_extensions_alt_text":null,"profile_image_extensions_media_availability":null,"profile_image_extensions_media_color":{"palette":[{"rgb":{"red":242,"green":107,"blue":52},"percentage":92.62},{"rgb":{"red":255,"green":255,"blue":255},"percentage":7.15},{"rgb":{"red":249,"green":193,"blue":168},"percentage":0.15}]},"profile_image_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_banner_extensions_alt_text":null,"profile_banner_extensions_media_availability":null,"profile_banner_extensions_media_color":{"palette":[{"rgb":{"red":229,"green":179,"blue":1},"percentage":46.93},{"rgb":{"red":11,"green":9,"blue":4},"percentage":19.06},{"rgb":{"red":241,"green":222,"blue":87},"percentage":8.48},{"rgb":{"red":144,"green":115,"blue":28},"percentage":8.16},{"rgb":{"red":238,"green":151,"blue":3},"percentage":5.56}]},"profile_banner_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_link_color":"FF6C2F","pinned_tweet_ids":[],"pinned_tweet_ids_str":[],"advertiser_account_type":"promotable_user","advertiser_account_service_levels":["smb","media_studio"],"profile_interstitial_type":"","business_profile_state":"none","translator_type":"none","withheld_in_countries":[],"ext":{"highlightedLabel":{"r":{"ok":{}},"ttl":-1}}}},"card_platform":{"platform":{"device":{"name":"Swift","version":"12"},"audience":{"name":"production"}}}},"lang":"en"} diff --git a/scraper/tweet.go b/scraper/tweet.go index 2a0ec04..20299fd 100644 --- a/scraper/tweet.go +++ b/scraper/tweet.go @@ -23,7 +23,7 @@ type Tweet struct { NumQuoteTweets int InReplyTo TweetID - Urls []string + Urls []Url Images []Image Videos []Video Mentions []UserHandle @@ -63,7 +63,7 @@ Replies: %d RT: %d QT: %d Likes: %d if len(t.Urls) > 0 { ret += "urls: [\n" for _, url := range(t.Urls) { - ret += " " + url + "\n" + ret += " " + url.Text + "\n" } ret += "]" } @@ -89,8 +89,18 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) { ret.NumQuoteTweets = apiTweet.QuoteCount ret.InReplyTo = TweetID(apiTweet.InReplyToStatusID) - for _, url := range apiTweet.Entities.URLs { - ret.Urls = append(ret.Urls, url.ExpandedURL) + for i, url := range apiTweet.Entities.URLs { + if i != 0 { + panic(fmt.Sprintf("Tweet with multiple embedded URLs: %d", apiTweet.ID)) + } + var url_object Url + if apiTweet.Card.BindingValues.Domain.Value != "" { + // Using the "Domain" field to detect if there is a card + url_object = ParseAPIUrlCard(apiTweet.Card) + } + url_object.Text = url.ExpandedURL + url_object.TweetID = ret.ID + ret.Urls = append(ret.Urls, url_object) } for _, media := range apiTweet.Entities.Media { if media.Type != "photo" { // TODO: remove this eventually diff --git a/scraper/tweet_test.go b/scraper/tweet_test.go index 834d8fc..4120ae5 100644 --- a/scraper/tweet_test.go +++ b/scraper/tweet_test.go @@ -52,6 +52,10 @@ func TestParseSingleTweet(t *testing.T) { t.Errorf("Expected %v, got %v", []string{"michaelmalice"}, tweet.Mentions) } + if len(tweet.Urls) != 0 { + t.Errorf("Expected %d urls, but got %d", 0, len(tweet.Urls)) + } + if tweet.PostedAt.Unix() != 1621639105 { t.Errorf("Expected %d, got %d", 1621639105, tweet.PostedAt.Unix()) } @@ -162,6 +166,66 @@ func TestParseTweetWithVideo(t *testing.T) { } } +func TestParseTweetWithUrl(t *testing.T) { + data, err := ioutil.ReadFile("test_responses/tweet_with_url_card.json") + if err != nil { + panic(err) + } + var apitweet scraper.APITweet + err = json.Unmarshal(data, &apitweet) + if err != nil { + t.Errorf(err.Error()) + } + tweet, err := scraper.ParseSingleTweet(apitweet) + if err != nil { + t.Errorf(err.Error()) + } + + if len(tweet.Urls) != 1 { + t.Errorf("Expected %d urls, but got %d", 1, len(tweet.Urls)) + } + + expected_url_text := "https://reason.com/2021/08/30/la-teachers-union-cecily-myart-cruz-learning-loss/" + if tweet.Urls[0].Text != expected_url_text { + t.Errorf("Expected Url text to be %q, but got %q", expected_url_text, tweet.Urls[0].Text) + } + if !tweet.Urls[0].HasCard { + t.Errorf("Expected it to have a card, but it doesn't") + } + expected_url_domain := "reason.com" + if tweet.Urls[0].Domain != expected_url_domain { + t.Errorf("Expected Url text to be %q, but got %q", expected_url_domain, tweet.Urls[0].Domain) + } +} + +func TestParseTweetWithUrlButNoCard(t *testing.T) { + data, err := ioutil.ReadFile("test_responses/tweet_with_url_but_no_card.json") + if err != nil { + panic(err) + } + var apitweet scraper.APITweet + err = json.Unmarshal(data, &apitweet) + if err != nil { + t.Errorf(err.Error()) + } + tweet, err := scraper.ParseSingleTweet(apitweet) + if err != nil { + t.Errorf(err.Error()) + } + + if len(tweet.Urls) != 1 { + t.Errorf("Expected %d urls, but got %d", 1, len(tweet.Urls)) + } + + expected_url_text := "https://www.politico.com/newsletters/west-wing-playbook/2021/09/16/the-jennifer-rubin-wh-symbiosis-494364" + if tweet.Urls[0].Text != expected_url_text { + t.Errorf("Expected Url text to be %q, but got %q", expected_url_text, tweet.Urls[0].Text) + } + if tweet.Urls[0].HasCard { + t.Errorf("Expected url not to have a card, but it thinks it has one") + } +} + func TestParseTweetResponse(t *testing.T) { data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json") if err != nil { diff --git a/scraper/url.go b/scraper/url.go index ffe1b3a..3c5f9e6 100644 --- a/scraper/url.go +++ b/scraper/url.go @@ -7,6 +7,8 @@ import ( ) type Url struct { + TweetID TweetID + Domain string Text string Title string