Add persistence for new Url type

2021-09-17 18:04:12 -07:00 · 2021-09-17 18:04:12 -07:00 · 05c3f2289b
commit 05c3f2289b
parent 79f098450e
12 changed files with 261 additions and 38 deletions
--- a/cmd/tests.sh
+++ b/cmd/tests.sh
@ -114,4 +114,20 @@ tw fetch_user HbdNrx
 test $(sqlite3 twitter.db "select is_private from users where handle = 'HbdNrx'") = "1"


+# Test tweets with URLs: fetching a tweet that contains a link should add exactly one row to `urls`
+urls_count=$(sqlite3 twitter.db "select count(*) from urls")
+tw fetch_tweet https://twitter.com/CovfefeAnon/status/1428904664645394433
+urls_count_after=$(sqlite3 twitter.db "select count(*) from urls")
+# Operands are quoted so that an empty query result fails the comparison instead of producing a `test` syntax error
+test "$urls_count_after" = "$(($urls_count + 1))"
+test "$(sqlite3 twitter.db "select title from urls where tweet_id = 1428904664645394433")" = "Justice Department investigating Elon Musk's SpaceX following complaint of hiring discrimination"
+test "$(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433")" = "https://pbs.twimg.com/card_img/1436430370946392064/WX1Rv2AJ?format=jpg&name=800x320_1"
+
+# Try to double-fetch it; shouldn't duplicate the URL
+tw fetch_tweet https://twitter.com/CovfefeAnon/status/1428904664645394433
+urls_count_after_2x=$(sqlite3 twitter.db "select count(*) from urls")
+test "$urls_count_after_2x" = "$urls_count_after"
+
+
+# TODO: Maybe this file should be broken up into multiple test scripts
+
 echo -e "\033[32mAll tests passed.  Finished successfully.\033[0m"
--- a/persistence/media_queries.go
+++ b/persistence/media_queries.go
@ -42,6 +42,22 @@ func (p Profile) SaveVideo(vid scraper.Video) error {
    return err
 }

+/**
+ * Save a Url.  If a row with the same (tweet_id, text) already exists, only its
+ * `is_content_downloaded` flag is updated; the other columns are left as they were.
+ *
+ * args:
+ * - url: the Url to save
+ *
+ * returns: the error from the database, if any
+ */
+func (p Profile) SaveUrl(url scraper.Url) error {
+    // The conflict target is spelled out: it matches the `unique (tweet_id, text)` constraint
+    // on the `urls` table, and a `do update` without a target is only accepted by SQLite 3.35+.
+    _, err := p.DB.Exec(`
+        insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded)
+                  values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+             on conflict (tweet_id, text) do update
+                     set is_content_downloaded=?
+        `,
+        url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.IsContentDownloaded,
+        url.IsContentDownloaded,
+    )
+    return err
+}
+
 /**
 * Get the list of images for a tweet
 */
@ -93,3 +109,28 @@ func (p Profile) GetVideosForTweet(t scraper.Tweet) (vids []scraper.Video, err e
    }
    return
 }
+
+/**
+ * Get the list of Urls for a Tweet, ordered by rowid
+ *
+ * args:
+ * - t: the tweet whose Urls to load; only `t.ID` is used
+ *
+ * returns: the Urls (each with its TweetID set to `t.ID`), and any error from preparing,
+ *          running or iterating the query
+ */
+func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error) {
+    stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded from urls where tweet_id=? order by rowid")
+    if err != nil {
+        return
+    }
+    defer stmt.Close()
+    rows, err := stmt.Query(t.ID)
+    if err != nil {
+        return
+    }
+    // Release the result set even when a Scan fails partway through
+    defer rows.Close()
+
+    for rows.Next() {
+        // Fresh value per row, so nothing carries over from the previous row
+        var url scraper.Url
+        err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.IsContentDownloaded)
+        if err != nil {
+            return
+        }
+        url.TweetID = t.ID
+        urls = append(urls, url)
+    }
+    // rows.Next() also returns false when iteration fails, so surface that error here
+    err = rows.Err()
+    return
+}
--- a/persistence/media_queries_test.go
+++ b/persistence/media_queries_test.go
@ -165,3 +165,81 @@ func TestModifyVideo(t *testing.T) {
        t.Error(diff)
    }
 }
+
+
+/**
+ * Round trip: save a newly generated Url on the stable tweet, load that tweet's Urls
+ * back, and check that the saved one is returned unchanged
+ */
+func TestSaveAndLoadUrl(t *testing.T) {
+    profile := create_or_load_profile("test_profiles/TestMediaQueries")
+    tweet := create_stable_tweet()

+    // Build a fresh Url (from a random ID) and attach it to the tweet
+    rand.Seed(time.Now().UnixNano())
+    url := create_url_from_id(rand.Int())
+    url.TweetID = tweet.ID

+    if save_err := profile.SaveUrl(url); save_err != nil {
+        t.Fatalf("Failed to save the url: %s", save_err.Error())
+    }

+    urls, err := profile.GetUrlsForTweet(tweet)
+    if err != nil {
+        t.Fatalf("Could not load urls: %s", err.Error())
+    }

+    // The tweet can carry several Urls; pick out the one just saved by its text
+    var new_url scraper.Url
+    for _, candidate := range urls {
+        if candidate.Text == url.Text {
+            new_url = candidate
+        }
+    }
+    if new_url.Text != url.Text {
+        t.Fatalf("Could not find url for some reason: %s, %s; %+v", new_url.Text, url.Text, urls)
+    }

+    diff := deep.Equal(url, new_url)
+    if diff != nil {
+        t.Error(diff)
+    }
+}
+
+/**
+ * Change a Url, save the changes, reload it, and check if it comes back the same.
+ *
+ * Works on the first Url of the stable tweet, which is expected to have the text "-1text".
+ */
+func TestModifyUrl(t *testing.T) {
+    profile_path := "test_profiles/TestMediaQueries"
+    profile := create_or_load_profile(profile_path)

+    tweet := create_stable_tweet()
+    // Guard the index so a missing fixture fails the test instead of panicking
+    if len(tweet.Urls) == 0 {
+        t.Fatalf("Stable tweet has no urls to modify")
+    }
+    url := tweet.Urls[0]

+    if url.Text != "-1text" {
+        t.Fatalf("Got the wrong url back: wanted %s, got %s!", "-1text", url.Text)
+    }

+    url.IsContentDownloaded = true

+    // Save the changes; there is nothing meaningful to compare if this fails
+    err := profile.SaveUrl(url)
+    if err != nil {
+        t.Fatalf("Failed to save the url: %s", err.Error())
+    }

+    // Reload it
+    urls, err := profile.GetUrlsForTweet(tweet)
+    if err != nil {
+        t.Fatalf("Could not load urls: %s", err.Error())
+    }
+    if len(urls) == 0 {
+        t.Fatalf("Expected at least one url for the stable tweet, but got none")
+    }
+    new_url := urls[0]
+    if new_url.Text != "-1text" {
+        t.Fatalf("Got the wrong url back: wanted %s, got %s!", "-1text", new_url.Text)
+    }

+    if diff := deep.Equal(url, new_url); diff != nil {
+        t.Error(diff)
+    }
+}
--- a/persistence/schema.sql
+++ b/persistence/schema.sql
@ -54,10 +54,21 @@ create table retweets(rowid integer primary key,

+-- One row per URL attached to a tweet.  `text` holds the URL itself; the other added columns
+-- hold link-preview ("card") metadata and, having no `not null`, may be null.
 create table urls (rowid integer primary key,
    tweet_id integer not null,
+    domain text,
    text text not null,
+    title text,
+    description text,
+    creator_id integer,
+    site_id integer,
+    thumbnail_remote_url text,
+    thumbnail_local_path text,
+    has_card boolean,
+    is_content_downloaded boolean default 0,

+    -- A given URL is stored at most once per tweet; a repeated insert conflicts on this pair
    unique (tweet_id, text)
    foreign key(tweet_id) references tweets(id)
+    -- NOTE(review): the two user references below are left disabled; presumably the card's
+    -- creator/site users are not guaranteed to exist in `users` -- confirm before enabling
+    -- foreign key(creator_id) references users(id)
+    -- foreign key(site_id) references users(id)
 );

 create table images (rowid integer primary key,
--- a/persistence/tweet_queries.go
+++ b/persistence/tweet_queries.go
@ -1,7 +1,6 @@
 package persistence

 import (
-    "fmt"
    "time"
    "strings"
    "database/sql"
@ -34,7 +33,7 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
        return err
    }
    for _, url := range t.Urls {
-        _, err := db.Exec("insert into urls (tweet_id, text) values (?, ?) on conflict do nothing", t.ID, url)
+        err := p.SaveUrl(url)
        if err != nil {
            return err
        }
@ -80,29 +79,6 @@ func (p Profile) IsTweetInDatabase(id scraper.TweetID) bool {
    return true
 }

-func (p Profile) attach_urls(t *scraper.Tweet) error {
-    println("Attaching urls")
-    stmt, err := p.DB.Prepare("select text from urls where tweet_id = ?")
-    if err != nil {
-        return err
-    }
-    defer stmt.Close()
-    rows, err := stmt.Query(t.ID)
-    if err != nil {
-        return err
-    }
-    var url string
-    for rows.Next() {
-        err = rows.Scan(&url)
-        if err != nil {
-            return err
-        }
-        t.Urls = append(t.Urls, url)
-        fmt.Printf("%v\n", t.Urls)
-    }
-    return nil
-}
-
 func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
    db := p.DB

@ -146,7 +122,9 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
    }
    t.Videos = vids

-    err = p.attach_urls(&t)
+    urls, err := p.GetUrlsForTweet(t)
+    t.Urls = urls
+
    return t, err
 }

--- a/persistence/tweet_queries_test.go
+++ b/persistence/tweet_queries_test.go
@ -29,12 +29,6 @@ func TestSaveAndLoadTweet(t *testing.T) {
        t.Fatalf("Failed to load the tweet: %s", err.Error())
    }

-    if diff := deep.Equal(tweet.Images, new_tweet.Images); diff != nil {
-        t.Error(diff)
-    }
-    if diff := deep.Equal(tweet.Videos, new_tweet.Videos); diff != nil {
-        t.Error(diff)
-    }
    if diff := deep.Equal(tweet, new_tweet); diff != nil {
        t.Error(diff)
    }
--- a/persistence/utils_test.go
+++ b/persistence/utils_test.go
@ -92,6 +92,26 @@ func create_video_from_id(id int) scraper.Video {
 	}
 }

+/**
+ * Create a semi-stable Url based on the given ID
+ */
+func create_url_from_id(id int) scraper.Url {
+	s := fmt.Sprint(id)
+	return scraper.Url {
+		TweetID: -1,
+		Domain: s + "domain",
+		Text: s + "text",
+		Title: s + "title",
+		Description: s + "description",
+		ThumbnailRemoteUrl: s + "remote url",
+		ThumbnailLocalPath: s + "local path",
+		CreatorID: scraper.UserID(id),
+		SiteID: scraper.UserID(id),
+		HasCard: true,
+		IsContentDownloaded: false,
+	}
+}
+
 /**
 * Create a stable tweet with a fixed ID and content
 */
@ -109,7 +129,9 @@ func create_stable_tweet() scraper.Tweet {
 		Videos: []scraper.Video{
 			create_video_from_id(-1),
 		},
-		Urls: []string{},
+		Urls: []scraper.Url{
+			create_url_from_id(-1),
+		},
 		Images: []scraper.Image{
 			create_image_from_id(-1),
 		},
@ -173,6 +195,11 @@ func create_dummy_tweet() scraper.Tweet {
 	vid := create_video_from_id(rand.Int())
 	vid.TweetID = tweet_id

+	url1 := create_url_from_id(rand.Int())
+	url1.TweetID = tweet_id
+	url2 := create_url_from_id(rand.Int())
+	url2.TweetID = tweet_id
+
 	return scraper.Tweet{
 		ID: tweet_id,
 		UserID: -1,
@ -183,7 +210,7 @@ func create_dummy_tweet() scraper.Tweet {
 		NumReplies: 3,
 		NumQuoteTweets: 4,
 		Videos: []scraper.Video{vid},
-		Urls: []string{"url1", "url2"},
+		Urls: []scraper.Url{url1, url2},
 		Images: []scraper.Image{img1, img2},
 		Mentions: []scraper.UserHandle{"mention1", "mention2"},
 		Hashtags: []string{"hash1", "hash2"},
--- a/scraper/test_responses/tweet_with_url_but_no_card.json
+++ b/scraper/test_responses/tweet_with_url_but_no_card.json
@ -0,0 +1 @@
+{"created_at":"Fri Sep 17 00:03:26 +0000 2021","id_str":"1438654793384353793","full_text":"NEW: columnist Jennifer Rubin was one of the Obama administration’s most reactionary critics\n\nNow she’s the Biden admin’s favorite columnist\nW/ @NickNiedz\n\nWe reached out to Rubin her columns and divisions at Wapo over them\n\nHer response—&gt;\nhttps://t.co/ZigZyLctwt https://t.co/KZZAK1tXhq","display_text_range":[0,266],"entities":{"user_mentions":[{"screen_name":"NickNiedz","name":"Nick Niedzwiadek","id_str":"548501303","indices":[144,154]}],"urls":[{"url":"https://t.co/ZigZyLctwt","expanded_url":"https://www.politico.com/newsletters/west-wing-playbook/2021/09/16/the-jennifer-rubin-wh-symbiosis-494364","display_url":"politico.com/newsletters/we…","indices":[243,266]}],"media":[{"id_str":"1438654789596942336","indices":[267,290],"media_url":"http://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","media_url_https":"https://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","url":"https://t.co/KZZAK1tXhq","display_url":"pic.twitter.com/KZZAK1tXhq","expanded_url":"https://twitter.com/AlexThomp/status/1438654793384353793/photo/1","type":"photo","original_info":{"width":1170,"height":1809,"focus_rects":[{"x":0,"y":0,"h":655,"w":1170},{"x":0,"y":0,"h":1170,"w":1170},{"x":0,"y":0,"h":1334,"w":1170},{"x":45,"y":0,"h":1809,"w":905},{"x":0,"y":0,"h":1809,"w":1170}]},"sizes":{"small":{"w":440,"h":680,"resize":"fit"},"medium":{"w":776,"h":1200,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1170,"h":1809,"resize":"fit"}}}]},"extended_entities":{"media":[{"id_str":"1438654789596942336","indices":[267,290],"media_url":"http://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","media_url_https":"https://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","url":"https://t.co/KZZAK1tXhq","display_url":"pic.twitter.com/KZZAK1tXhq","expanded_url":"https://twitter.com/AlexThomp/status/1438654793384353793/photo/1","type":"photo","original_info":{"width":1170,"height":1809,"focus_rects":[{"x":0,"y":0,"h":
655,"w":1170},{"x":0,"y":0,"h":1170,"w":1170},{"x":0,"y":0,"h":1334,"w":1170},{"x":45,"y":0,"h":1809,"w":905},{"x":0,"y":0,"h":1809,"w":1170}]},"sizes":{"small":{"w":440,"h":680,"resize":"fit"},"medium":{"w":776,"h":1200,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1170,"h":1809,"resize":"fit"}},"media_key":"3_1438654789596942336","ext_alt_text":null,"ext_media_availability":{"status":"available"},"ext_media_color":{"palette":[{"rgb":{"red":252,"green":252,"blue":252},"percentage":99.67},{"rgb":{"red":145,"green":145,"blue":145},"percentage":0.33}]},"ext":{"mediaStats":{"r":"Missing","ttl":-1}}}]},"source":"<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>","user_id_str":"370982639","retweet_count":407,"favorite_count":1802,"reply_count":2642,"quote_count":884,"conversation_id_str":"1438654793384353793","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1438654793384353793"}}
--- a/scraper/test_responses/tweet_with_url_card.json
+++ b/scraper/test_responses/tweet_with_url_card.json
--- a/scraper/tweet.go
+++ b/scraper/tweet.go
@ -23,7 +23,7 @@ type Tweet struct {
 	NumQuoteTweets int
 	InReplyTo      TweetID

-	Urls        []string
+	Urls        []Url
 	Images      []Image
 	Videos      []Video
 	Mentions    []UserHandle
@ -63,7 +63,7 @@ Replies: %d      RT: %d      QT: %d      Likes: %d
 	if len(t.Urls) > 0 {
 		ret += "urls: [\n"
 		for _, url := range(t.Urls) {
-			ret += "  " + url + "\n"
+			ret += "  " + url.Text + "\n"
 		}
 		ret += "]"
 	}
@ -89,8 +89,18 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
 	ret.NumQuoteTweets = apiTweet.QuoteCount
 	ret.InReplyTo = TweetID(apiTweet.InReplyToStatusID)

-	for _, url := range apiTweet.Entities.URLs {
-		ret.Urls = append(ret.Urls, url.ExpandedURL)
+	for i, url := range apiTweet.Entities.URLs {
+		if i != 0 {
+			panic(fmt.Sprintf("Tweet with multiple embedded URLs: %d", apiTweet.ID))
+		}
+		var url_object Url
+		if apiTweet.Card.BindingValues.Domain.Value != "" {
+			// Using the "Domain" field to detect if there is a card
+			url_object = ParseAPIUrlCard(apiTweet.Card)
+		}
+		url_object.Text = url.ExpandedURL
+		url_object.TweetID = ret.ID
+		ret.Urls = append(ret.Urls, url_object)
 	}
 	for _, media := range apiTweet.Entities.Media {
 		if media.Type != "photo" {  // TODO: remove this eventually
--- a/scraper/tweet_test.go
+++ b/scraper/tweet_test.go
@ -52,6 +52,10 @@ func TestParseSingleTweet(t *testing.T) {
 		t.Errorf("Expected %v, got %v", []string{"michaelmalice"}, tweet.Mentions)
 	}

+	if len(tweet.Urls) != 0 {
+		t.Errorf("Expected %d urls, but got %d", 0, len(tweet.Urls))
+	}
+
 	if tweet.PostedAt.Unix() != 1621639105 {
 		t.Errorf("Expected %d, got %d", 1621639105, tweet.PostedAt.Unix())
 	}
@ -162,6 +166,66 @@ func TestParseTweetWithVideo(t *testing.T) {
 	}
 }

+/**
+ * Parse a tweet whose link has a preview card, and check the resulting Url's text,
+ * card flag and domain
+ */
+func TestParseTweetWithUrl(t *testing.T) {
+	data, err := ioutil.ReadFile("test_responses/tweet_with_url_card.json")
+	if err != nil {
+		panic(err)
+	}
+	var apitweet scraper.APITweet
+	err = json.Unmarshal(data, &apitweet)
+	if err != nil {
+		t.Fatal(err)
+	}
+	tweet, err := scraper.ParseSingleTweet(apitweet)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Fatal rather than Error: the checks below index tweet.Urls[0]
+	if len(tweet.Urls) != 1 {
+		t.Fatalf("Expected %d urls, but got %d", 1, len(tweet.Urls))
+	}
+
+	expected_url_text := "https://reason.com/2021/08/30/la-teachers-union-cecily-myart-cruz-learning-loss/"
+	if tweet.Urls[0].Text != expected_url_text {
+		t.Errorf("Expected Url text to be %q, but got %q", expected_url_text, tweet.Urls[0].Text)
+	}
+	if !tweet.Urls[0].HasCard {
+		t.Errorf("Expected it to have a card, but it doesn't")
+	}
+	expected_url_domain := "reason.com"
+	if tweet.Urls[0].Domain != expected_url_domain {
+		t.Errorf("Expected Url domain to be %q, but got %q", expected_url_domain, tweet.Urls[0].Domain)
+	}
+}
+
+/**
+ * Parse a tweet that contains a link but no preview card, and check that the Url is
+ * still recorded with the right text and without the card flag
+ */
+func TestParseTweetWithUrlButNoCard(t *testing.T) {
+	data, err := ioutil.ReadFile("test_responses/tweet_with_url_but_no_card.json")
+	if err != nil {
+		panic(err)
+	}
+	var apitweet scraper.APITweet
+	err = json.Unmarshal(data, &apitweet)
+	if err != nil {
+		t.Fatal(err)
+	}
+	tweet, err := scraper.ParseSingleTweet(apitweet)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Fatal rather than Error: the checks below index tweet.Urls[0]
+	if len(tweet.Urls) != 1 {
+		t.Fatalf("Expected %d urls, but got %d", 1, len(tweet.Urls))
+	}
+
+	expected_url_text := "https://www.politico.com/newsletters/west-wing-playbook/2021/09/16/the-jennifer-rubin-wh-symbiosis-494364"
+	if tweet.Urls[0].Text != expected_url_text {
+		t.Errorf("Expected Url text to be %q, but got %q", expected_url_text, tweet.Urls[0].Text)
+	}
+	if tweet.Urls[0].HasCard {
+		t.Errorf("Expected url not to have a card, but it thinks it has one")
+	}
+}
+
 func TestParseTweetResponse(t *testing.T) {
 	data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")
 	if err != nil {
--- a/scraper/url.go
+++ b/scraper/url.go
@ -7,6 +7,8 @@ import (
 )

 type Url struct {
+	TweetID TweetID
+
 	Domain string
 	Text string
 	Title string
				`@ -0,0 +1 @@`
				{"created_at":"Fri Sep 17 00:03:26 +0000 2021","id_str":"1438654793384353793","full_text":"NEW: columnist Jennifer Rubin was one of the Obama administration’s most reactionary critics\n\nNow she’s the Biden admin’s favorite columnist\nW/ @NickNiedz\n\nWe reached out to Rubin her columns and divisions at Wapo over them\n\nHer response—>\nhttps://t.co/ZigZyLctwt https://t.co/KZZAK1tXhq","display_text_range":[0,266],"entities":{"user_mentions":[{"screen_name":"NickNiedz","name":"Nick Niedzwiadek","id_str":"548501303","indices":[144,154]}],"urls":[{"url":"https://t.co/ZigZyLctwt","expanded_url":"https://www.politico.com/newsletters/west-wing-playbook/2021/09/16/the-jennifer-rubin-wh-symbiosis-494364","display_url":"politico.com/newsletters/we…","indices":[243,266]}],"media":[{"id_str":"1438654789596942336","indices":[267,290],"media_url":"http://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","media_url_https":"https://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","url":"https://t.co/KZZAK1tXhq","display_url":"pic.twitter.com/KZZAK1tXhq","expanded_url":"https://twitter.com/AlexThomp/status/1438654793384353793/photo/1","type":"photo","original_info":{"width":1170,"height":1809,"focus_rects":[{"x":0,"y":0,"h":655,"w":1170},{"x":0,"y":0,"h":1170,"w":1170},{"x":0,"y":0,"h":1334,"w":1170},{"x":45,"y":0,"h":1809,"w":905},{"x":0,"y":0,"h":1809,"w":1170}]},"sizes":{"small":{"w":440,"h":680,"resize":"fit"},"medium":{"w":776,"h":1200,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1170,"h":1809,"resize":"fit"}}}]},"extended_entities":{"media":[{"id_str":"1438654789596942336","indices":[267,290],"media_url":"http://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","media_url_https":"https://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","url":"https://t.co/KZZAK1tXhq","display_url":"pic.twitter.com/KZZAK1tXhq","expanded_url":"https://twitter.com/AlexThomp/status/1438654793384353793/photo/1","type":"photo","original_info":{"width":1170,"height":1809,"focus_rects":[{"x":0,"y":0,"h":
655,"w":1170},{"x":0,"y":0,"h":1170,"w":1170},{"x":0,"y":0,"h":1334,"w":1170},{"x":45,"y":0,"h":1809,"w":905},{"x":0,"y":0,"h":1809,"w":1170}]},"sizes":{"small":{"w":440,"h":680,"resize":"fit"},"medium":{"w":776,"h":1200,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1170,"h":1809,"resize":"fit"}},"media_key":"3_1438654789596942336","ext_alt_text":null,"ext_media_availability":{"status":"available"},"ext_media_color":{"palette":[{"rgb":{"red":252,"green":252,"blue":252},"percentage":99.67},{"rgb":{"red":145,"green":145,"blue":145},"percentage":0.33}]},"ext":{"mediaStats":{"r":"Missing","ttl":-1}}}]},"source":"<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>","user_id_str":"370982639","retweet_count":407,"favorite_count":1802,"reply_count":2642,"quote_count":884,"conversation_id_str":"1438654793384353793","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1438654793384353793"}}