Add persistence for new Url type
parent 79f098450e
commit 05c3f2289b

cmd/tests.sh | 16 ++++++++++++++++
@@ -114,4 +114,20 @@ tw fetch_user HbdNrx
 test $(sqlite3 twitter.db "select is_private from users where handle = 'HbdNrx'") = "1"
 
+# Test tweets with URLs
+urls_count=$(sqlite3 twitter.db "select count(*) from urls")
+tw fetch_tweet https://twitter.com/CovfefeAnon/status/1428904664645394433
+urls_count_after=$(sqlite3 twitter.db "select count(*) from urls")
+test $urls_count_after = $(($urls_count + 1))
+test "$(sqlite3 twitter.db "select title from urls where tweet_id = 1428904664645394433")" = "Justice Department investigating Elon Musk's SpaceX following complaint of hiring discrimination"
+test $(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433") = "https://pbs.twimg.com/card_img/1436430370946392064/WX1Rv2AJ?format=jpg&name=800x320_1"
+
+# Try to double-fetch it; shouldn't duplicate the URL
+tw fetch_tweet https://twitter.com/CovfefeAnon/status/1428904664645394433
+urls_count_after_2x=$(sqlite3 twitter.db "select count(*) from urls")
+test $urls_count_after_2x = $urls_count_after
+
+
+# TODO: Maybe this file should be broken up into multiple test scripts
+
 echo -e "\033[32mAll tests passed. Finished successfully.\033[0m"
@@ -42,6 +42,22 @@ func (p Profile) SaveVideo(vid scraper.Video) error {
 	return err
 }
 
+/**
+ * Save an Url
+ */
+func (p Profile) SaveUrl(url scraper.Url) error {
+	_, err := p.DB.Exec(`
+		insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded)
+		            values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+		       on conflict do update
+		       set is_content_downloaded=?
+		`,
+		url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.IsContentDownloaded,
+		url.IsContentDownloaded,
+	)
+	return err
+}
+
 /**
  * Get the list of images for a tweet
  */
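A note on the SaveUrl hunk above: the insert lists eleven placeholders but twelve values are bound, because the trailing `set is_content_downloaded=?` consumes one more. The bare `on conflict` clause resolves against the `unique (tweet_id, text)` index added in the schema hunk further down, which is what makes re-fetching a tweet idempotent. Below is a minimal standalone sketch of this pattern, assuming the github.com/mattn/go-sqlite3 driver and SQLite >= 3.24 (upsert support); the commit itself doesn't show which driver the project uses.

    package main

    import (
        "database/sql"
        "fmt"

        _ "github.com/mattn/go-sqlite3"
    )

    func main() {
        db, err := sql.Open("sqlite3", ":memory:")
        if err != nil {
            panic(err)
        }
        defer db.Close()

        // Trimmed-down urls table; the unique index is what the bare
        // "on conflict" clause resolves against.
        _, err = db.Exec(`create table urls (tweet_id integer not null, text text not null,
            is_content_downloaded boolean default 0, unique (tweet_id, text))`)
        if err != nil {
            panic(err)
        }

        save := func(downloaded bool) {
            // "set ...=?" takes one more placeholder than the values list,
            // so the flag is bound twice -- the same shape as SaveUrl.
            _, err := db.Exec(`insert into urls (tweet_id, text, is_content_downloaded)
                values (?, ?, ?)
                on conflict do update set is_content_downloaded=?`,
                12345, "https://example.com", downloaded, downloaded)
            if err != nil {
                panic(err)
            }
        }

        save(false)
        save(true) // same (tweet_id, text): updates in place, no duplicate row

        var count int
        var downloaded bool
        db.QueryRow("select count(*), is_content_downloaded from urls").Scan(&count, &downloaded)
        fmt.Println(count, downloaded) // 1 true
    }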
@@ -93,3 +109,28 @@ func (p Profile) GetVideosForTweet(t scraper.Tweet) (vids []scraper.Video, err error) {
 	}
 	return
 }
+
+/**
+ * Get the list of Urls for a Tweet
+ */
+func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error) {
+	stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded from urls where tweet_id=? order by rowid")
+	if err != nil {
+		return
+	}
+	defer stmt.Close()
+	rows, err := stmt.Query(t.ID)
+	if err != nil {
+		return
+	}
+	var url scraper.Url
+	for rows.Next() {
+		err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.IsContentDownloaded)
+		if err != nil {
+			return
+		}
+		url.TweetID = t.ID
+		urls = append(urls, url)
+	}
+	return
+}
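GetUrlsForTweet reuses a single `url` variable across loop iterations. That is safe in Go because `append` stores a copy of the struct value each time, so a later `rows.Scan` can't clobber rows already collected. A pure-stdlib sketch of that semantics (toy struct, not the real scraper.Url):

    package main

    import "fmt"

    type Url struct {
        Text string
    }

    func main() {
        var url Url // one variable, reused every iteration, like GetUrlsForTweet
        var urls []Url
        for _, text := range []string{"https://a.example", "https://b.example"} {
            url.Text = text          // stands in for rows.Scan(&url.Text, ...)
            urls = append(urls, url) // copies the value; the slice owns its own Url
        }
        fmt.Println(urls) // [{https://a.example} {https://b.example}]
    }

One thing the hunk doesn't do is check rows.Err() after the loop; an iteration cut short by a driver error would be indistinguishable from a short result set.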
@@ -165,3 +165,81 @@ func TestModifyVideo(t *testing.T) {
 		t.Error(diff)
 	}
 }
+
+
+/**
+ * Create an Url, save it, reload it, and make sure it comes back the same
+ */
+func TestSaveAndLoadUrl(t *testing.T) {
+	profile_path := "test_profiles/TestMediaQueries"
+	profile := create_or_load_profile(profile_path)
+
+	tweet := create_stable_tweet()
+
+	// Create a fresh Url to test on
+	rand.Seed(time.Now().UnixNano())
+	url := create_url_from_id(rand.Int())
+	url.TweetID = tweet.ID
+
+	// Save the Url
+	err := profile.SaveUrl(url)
+	if err != nil {
+		t.Fatalf("Failed to save the url: %s", err.Error())
+	}
+
+	// Reload the Url
+	urls, err := profile.GetUrlsForTweet(tweet)
+	if err != nil {
+		t.Fatalf("Could not load urls: %s", err.Error())
+	}
+
+	var new_url scraper.Url
+	for index := range urls {
+		if urls[index].Text == url.Text {
+			new_url = urls[index]
+		}
+	}
+	if new_url.Text != url.Text {
+		t.Fatalf("Could not find url for some reason: %s, %s; %+v", new_url.Text, url.Text, urls)
+	}
+	if diff := deep.Equal(url, new_url); diff != nil {
+		t.Error(diff)
+	}
+}
+
+/**
+ * Change an Url, save the changes, reload it, and check if it comes back the same
+ */
+func TestModifyUrl(t *testing.T) {
+	profile_path := "test_profiles/TestMediaQueries"
+	profile := create_or_load_profile(profile_path)
+
+	tweet := create_stable_tweet()
+	url := tweet.Urls[0]
+
+	if url.Text != "-1text" {
+		t.Fatalf("Got the wrong url back: wanted %s, got %s!", "-1text", url.Text)
+	}
+
+	url.IsContentDownloaded = true
+
+	// Save the changes
+	err := profile.SaveUrl(url)
+	if err != nil {
+		t.Error(err)
+	}
+
+	// Reload it
+	urls, err := profile.GetUrlsForTweet(tweet)
+	if err != nil {
+		t.Fatalf("Could not load urls: %s", err.Error())
+	}
+	new_url := urls[0]
+	if new_url.Text != "-1text" {
+		t.Fatalf("Got the wrong url back: wanted %s, got %s!", "-1text", new_url.Text)
+	}
+
+	if diff := deep.Equal(url, new_url); diff != nil {
+		t.Error(diff)
+	}
+}
@@ -54,10 +54,21 @@ create table retweets(rowid integer primary key,
 
 create table urls (rowid integer primary key,
 	tweet_id integer not null,
+	domain text,
 	text text not null,
+	title text,
+	description text,
+	creator_id integer,
+	site_id integer,
+	thumbnail_remote_url text,
+	thumbnail_local_path text,
+	has_card boolean,
+	is_content_downloaded boolean default 0,
 
 	unique (tweet_id, text)
 	foreign key(tweet_id) references tweets(id)
+	-- foreign key(creator_id) references users(id)
+	-- foreign key(site_id) references users(id)
 );
 
 create table images (rowid integer primary key,
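On the new `has_card` and `is_content_downloaded` columns: SQLite has no native boolean type, so "boolean" columns store 0/1 integers, and Go's database/sql converts bools in both directions when binding and scanning. A small sketch, again assuming the github.com/mattn/go-sqlite3 driver:

    package main

    import (
        "database/sql"
        "fmt"

        _ "github.com/mattn/go-sqlite3"
    )

    func main() {
        db, err := sql.Open("sqlite3", ":memory:")
        if err != nil {
            panic(err)
        }
        defer db.Close()

        db.Exec("create table t (has_card boolean, is_content_downloaded boolean default 0)")
        db.Exec("insert into t (has_card) values (?)", true) // the other column takes its default

        var hasCard, downloaded bool
        db.QueryRow("select has_card, is_content_downloaded from t").Scan(&hasCard, &downloaded)
        fmt.Println(hasCard, downloaded) // true false
    }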
@@ -1,7 +1,6 @@
 package persistence
 
 import (
-	"fmt"
 	"time"
 	"strings"
 	"database/sql"
@@ -34,7 +33,7 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
 		return err
 	}
 	for _, url := range t.Urls {
-		_, err := db.Exec("insert into urls (tweet_id, text) values (?, ?) on conflict do nothing", t.ID, url)
+		err := p.SaveUrl(url)
 		if err != nil {
 			return err
 		}
@@ -80,29 +79,6 @@ func (p Profile) IsTweetInDatabase(id scraper.TweetID) bool {
 	return true
 }
 
-func (p Profile) attach_urls(t *scraper.Tweet) error {
-	println("Attaching urls")
-	stmt, err := p.DB.Prepare("select text from urls where tweet_id = ?")
-	if err != nil {
-		return err
-	}
-	defer stmt.Close()
-	rows, err := stmt.Query(t.ID)
-	if err != nil {
-		return err
-	}
-	var url string
-	for rows.Next() {
-		err = rows.Scan(&url)
-		if err != nil {
-			return err
-		}
-		t.Urls = append(t.Urls, url)
-		fmt.Printf("%v\n", t.Urls)
-	}
-	return nil
-}
-
 func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
 	db := p.DB
 
@@ -146,7 +122,9 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
 	}
 	t.Videos = vids
 
-	err = p.attach_urls(&t)
+	urls, err := p.GetUrlsForTweet(t)
+	t.Urls = urls
+
 	return t, err
 }
 
@@ -29,12 +29,6 @@ func TestSaveAndLoadTweet(t *testing.T) {
 		t.Fatalf("Failed to load the tweet: %s", err.Error())
 	}
 
-	if diff := deep.Equal(tweet.Images, new_tweet.Images); diff != nil {
-		t.Error(diff)
-	}
-	if diff := deep.Equal(tweet.Videos, new_tweet.Videos); diff != nil {
-		t.Error(diff)
-	}
 	if diff := deep.Equal(tweet, new_tweet); diff != nil {
 		t.Error(diff)
 	}
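The deleted per-field Images/Videos comparisons above were redundant: a whole-struct deep.Equal already reports each differing field by path in its diff output. A sketch, assuming the go-test/deep package (which matches the deep.Equal usage in these tests):

    package main

    import (
        "fmt"

        "github.com/go-test/deep"
    )

    type Tweet struct {
        Text string
        Urls []string
    }

    func main() {
        a := Tweet{Text: "hi", Urls: []string{"https://a.example"}}
        b := Tweet{Text: "hi", Urls: []string{"https://b.example"}}
        if diff := deep.Equal(a, b); diff != nil {
            fmt.Println(diff) // e.g. [Urls.slice[0]: https://a.example != https://b.example]
        }
    }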
@@ -92,6 +92,26 @@ func create_video_from_id(id int) scraper.Video {
 	}
 }
 
+/**
+ * Create a semi-stable Url based on the given ID
+ */
+func create_url_from_id(id int) scraper.Url {
+	s := fmt.Sprint(id)
+	return scraper.Url {
+		TweetID: -1,
+		Domain: s + "domain",
+		Text: s + "text",
+		Title: s + "title",
+		Description: s + "description",
+		ThumbnailRemoteUrl: s + "remote url",
+		ThumbnailLocalPath: s + "local path",
+		CreatorID: scraper.UserID(id),
+		SiteID: scraper.UserID(id),
+		HasCard: true,
+		IsContentDownloaded: false,
+	}
+}
+
 /**
  * Create a stable tweet with a fixed ID and content
 */
@@ -109,7 +129,9 @@ func create_stable_tweet() scraper.Tweet {
 		Videos: []scraper.Video{
 			create_video_from_id(-1),
 		},
-		Urls: []string{},
+		Urls: []scraper.Url{
+			create_url_from_id(-1),
+		},
 		Images: []scraper.Image{
 			create_image_from_id(-1),
 		},
@@ -173,6 +195,11 @@ func create_dummy_tweet() scraper.Tweet {
 	vid := create_video_from_id(rand.Int())
 	vid.TweetID = tweet_id
 
+	url1 := create_url_from_id(rand.Int())
+	url1.TweetID = tweet_id
+	url2 := create_url_from_id(rand.Int())
+	url2.TweetID = tweet_id
+
 	return scraper.Tweet{
 		ID: tweet_id,
 		UserID: -1,
@@ -183,7 +210,7 @@ func create_dummy_tweet() scraper.Tweet {
 		NumReplies: 3,
 		NumQuoteTweets: 4,
 		Videos: []scraper.Video{vid},
-		Urls: []string{"url1", "url2"},
+		Urls: []scraper.Url{url1, url2},
 		Images: []scraper.Image{img1, img2},
 		Mentions: []scraper.UserHandle{"mention1", "mention2"},
 		Hashtags: []string{"hash1", "hash2"},
scraper/test_responses/tweet_with_url_but_no_card.json | 1 + (new file)

@@ -0,0 +1 @@
+{"created_at":"Fri Sep 17 00:03:26 +0000 2021","id_str":"1438654793384353793","full_text":"NEW: columnist Jennifer Rubin was one of the Obama administration’s most reactionary critics\n\nNow she’s the Biden admin’s favorite columnist\nW/ @NickNiedz\n\nWe reached out to Rubin her columns and divisions at Wapo over them\n\nHer response—>\nhttps://t.co/ZigZyLctwt https://t.co/KZZAK1tXhq","display_text_range":[0,266],"entities":{"user_mentions":[{"screen_name":"NickNiedz","name":"Nick Niedzwiadek","id_str":"548501303","indices":[144,154]}],"urls":[{"url":"https://t.co/ZigZyLctwt","expanded_url":"https://www.politico.com/newsletters/west-wing-playbook/2021/09/16/the-jennifer-rubin-wh-symbiosis-494364","display_url":"politico.com/newsletters/we…","indices":[243,266]}],"media":[{"id_str":"1438654789596942336","indices":[267,290],"media_url":"http://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","media_url_https":"https://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","url":"https://t.co/KZZAK1tXhq","display_url":"pic.twitter.com/KZZAK1tXhq","expanded_url":"https://twitter.com/AlexThomp/status/1438654793384353793/photo/1","type":"photo","original_info":{"width":1170,"height":1809,"focus_rects":[{"x":0,"y":0,"h":655,"w":1170},{"x":0,"y":0,"h":1170,"w":1170},{"x":0,"y":0,"h":1334,"w":1170},{"x":45,"y":0,"h":1809,"w":905},{"x":0,"y":0,"h":1809,"w":1170}]},"sizes":{"small":{"w":440,"h":680,"resize":"fit"},"medium":{"w":776,"h":1200,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1170,"h":1809,"resize":"fit"}}}]},"extended_entities":{"media":[{"id_str":"1438654789596942336","indices":[267,290],"media_url":"http://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","media_url_https":"https://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","url":"https://t.co/KZZAK1tXhq","display_url":"pic.twitter.com/KZZAK1tXhq","expanded_url":"https://twitter.com/AlexThomp/status/1438654793384353793/photo/1","type":"photo","original_info":{"width":1170,"height":1809,"focus_rects":[{"x":0,"y":0,"h":655,"w":1170},{"x":0,"y":0,"h":1170,"w":1170},{"x":0,"y":0,"h":1334,"w":1170},{"x":45,"y":0,"h":1809,"w":905},{"x":0,"y":0,"h":1809,"w":1170}]},"sizes":{"small":{"w":440,"h":680,"resize":"fit"},"medium":{"w":776,"h":1200,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1170,"h":1809,"resize":"fit"}},"media_key":"3_1438654789596942336","ext_alt_text":null,"ext_media_availability":{"status":"available"},"ext_media_color":{"palette":[{"rgb":{"red":252,"green":252,"blue":252},"percentage":99.67},{"rgb":{"red":145,"green":145,"blue":145},"percentage":0.33}]},"ext":{"mediaStats":{"r":"Missing","ttl":-1}}}]},"source":"<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>","user_id_str":"370982639","retweet_count":407,"favorite_count":1802,"reply_count":2642,"quote_count":884,"conversation_id_str":"1438654793384353793","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1438654793384353793"}}
scraper/test_responses/tweet_with_url_card.json | 1 + (new file)

File diff suppressed because one or more lines are too long
@@ -23,7 +23,7 @@ type Tweet struct {
 	NumQuoteTweets int
 	InReplyTo      TweetID
 
-	Urls        []string
+	Urls        []Url
 	Images      []Image
 	Videos      []Video
 	Mentions    []UserHandle
@@ -63,7 +63,7 @@ Replies: %d RT: %d QT: %d Likes: %d
 	if len(t.Urls) > 0 {
 		ret += "urls: [\n"
 		for _, url := range(t.Urls) {
-			ret += "  " + url + "\n"
+			ret += "  " + url.Text + "\n"
 		}
 		ret += "]"
 	}
@@ -89,8 +89,18 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
 	ret.NumQuoteTweets = apiTweet.QuoteCount
 	ret.InReplyTo = TweetID(apiTweet.InReplyToStatusID)
 
-	for _, url := range apiTweet.Entities.URLs {
-		ret.Urls = append(ret.Urls, url.ExpandedURL)
+	for i, url := range apiTweet.Entities.URLs {
+		if i != 0 {
+			panic(fmt.Sprintf("Tweet with multiple embedded URLs: %d", apiTweet.ID))
+		}
+		var url_object Url
+		if apiTweet.Card.BindingValues.Domain.Value != "" {
+			// Using the "Domain" field to detect if there is a card
+			url_object = ParseAPIUrlCard(apiTweet.Card)
+		}
+		url_object.Text = url.ExpandedURL
+		url_object.TweetID = ret.ID
+		ret.Urls = append(ret.Urls, url_object)
 	}
 	for _, media := range apiTweet.Entities.Media {
 		if media.Type != "photo" {  // TODO: remove this eventually
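The parser hunk above detects a card by checking whether the card's Domain binding is non-empty, since the API payload carries no explicit "has card" flag; ParseAPIUrlCard then fills in the preview fields, and the zero-value Url (HasCard false) is used otherwise. A toy sketch of that control flow — the types below only mirror the shape of the data, they are not the real scraper definitions:

    package main

    import "fmt"

    type BindingValue struct{ Value string }

    type Card struct {
        BindingValues struct {
            Domain BindingValue
        }
    }

    type Url struct {
        Text    string
        Domain  string
        HasCard bool
    }

    func parseUrl(expandedUrl string, card Card) Url {
        var url Url
        if card.BindingValues.Domain.Value != "" {
            // Card detected: fill in the preview fields.
            url.Domain = card.BindingValues.Domain.Value
            url.HasCard = true
        }
        url.Text = expandedUrl
        return url
    }

    func main() {
        var noCard Card
        fmt.Println(parseUrl("https://example.com/a", noCard)) // HasCard: false

        withCard := Card{}
        withCard.BindingValues.Domain.Value = "example.com"
        fmt.Println(parseUrl("https://example.com/a", withCard)) // HasCard: true
    }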
@@ -52,6 +52,10 @@ func TestParseSingleTweet(t *testing.T) {
 		t.Errorf("Expected %v, got %v", []string{"michaelmalice"}, tweet.Mentions)
 	}
 
+	if len(tweet.Urls) != 0 {
+		t.Errorf("Expected %d urls, but got %d", 0, len(tweet.Urls))
+	}
+
 	if tweet.PostedAt.Unix() != 1621639105 {
 		t.Errorf("Expected %d, got %d", 1621639105, tweet.PostedAt.Unix())
 	}
@@ -162,6 +166,66 @@ func TestParseTweetWithVideo(t *testing.T) {
 	}
 }
+
+func TestParseTweetWithUrl(t *testing.T) {
+	data, err := ioutil.ReadFile("test_responses/tweet_with_url_card.json")
+	if err != nil {
+		panic(err)
+	}
+	var apitweet scraper.APITweet
+	err = json.Unmarshal(data, &apitweet)
+	if err != nil {
+		t.Errorf(err.Error())
+	}
+	tweet, err := scraper.ParseSingleTweet(apitweet)
+	if err != nil {
+		t.Errorf(err.Error())
+	}
+
+	if len(tweet.Urls) != 1 {
+		t.Errorf("Expected %d urls, but got %d", 1, len(tweet.Urls))
+	}
+
+	expected_url_text := "https://reason.com/2021/08/30/la-teachers-union-cecily-myart-cruz-learning-loss/"
+	if tweet.Urls[0].Text != expected_url_text {
+		t.Errorf("Expected Url text to be %q, but got %q", expected_url_text, tweet.Urls[0].Text)
+	}
+	if !tweet.Urls[0].HasCard {
+		t.Errorf("Expected it to have a card, but it doesn't")
+	}
+	expected_url_domain := "reason.com"
+	if tweet.Urls[0].Domain != expected_url_domain {
+		t.Errorf("Expected Url domain to be %q, but got %q", expected_url_domain, tweet.Urls[0].Domain)
+	}
+}
+
+func TestParseTweetWithUrlButNoCard(t *testing.T) {
+	data, err := ioutil.ReadFile("test_responses/tweet_with_url_but_no_card.json")
+	if err != nil {
+		panic(err)
+	}
+	var apitweet scraper.APITweet
+	err = json.Unmarshal(data, &apitweet)
+	if err != nil {
+		t.Errorf(err.Error())
+	}
+	tweet, err := scraper.ParseSingleTweet(apitweet)
+	if err != nil {
+		t.Errorf(err.Error())
+	}
+
+	if len(tweet.Urls) != 1 {
+		t.Errorf("Expected %d urls, but got %d", 1, len(tweet.Urls))
+	}
+
+	expected_url_text := "https://www.politico.com/newsletters/west-wing-playbook/2021/09/16/the-jennifer-rubin-wh-symbiosis-494364"
+	if tweet.Urls[0].Text != expected_url_text {
+		t.Errorf("Expected Url text to be %q, but got %q", expected_url_text, tweet.Urls[0].Text)
+	}
+	if tweet.Urls[0].HasCard {
+		t.Errorf("Expected url not to have a card, but it thinks it has one")
+	}
+}
+
 func TestParseTweetResponse(t *testing.T) {
 	data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")
 	if err != nil {
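One caveat on the two new parser tests: the length check uses t.Errorf, which lets the test keep running, so if parsing ever returns zero URLs the `tweet.Urls[0]` accesses below it panic with an index-out-of-range rather than failing cleanly. A fail-fast variant would use t.Fatalf; a sketch with a hypothetical helper (not part of this commit), shown with plain strings to stay self-contained:

    package scraper_test

    import "testing"

    // checkSingleUrl stops the test on a bad length, so the caller can
    // safely index urls[0] afterwards.
    func checkSingleUrl(t *testing.T, urls []string) string {
        if len(urls) != 1 {
            t.Fatalf("Expected %d urls, but got %d", 1, len(urls))
        }
        return urls[0]
    }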
@@ -7,6 +7,8 @@ import (
 )
 
 type Url struct {
+	TweetID            TweetID
+
 	Domain             string
 	Text               string
 	Title              string