Add persistence for new Url type

This commit is contained in:
Alessio 2021-09-17 18:04:12 -07:00
parent 79f098450e
commit 05c3f2289b
12 changed files with 261 additions and 38 deletions

View File

@ -114,4 +114,20 @@ tw fetch_user HbdNrx
test $(sqlite3 twitter.db "select is_private from users where handle = 'HbdNrx'") = "1"
# Test tweets with URLs
# Fetching a tweet that embeds a link should add exactly one row to `urls`,
# with the card's title and thumbnail recorded.
urls_count=$(sqlite3 twitter.db "select count(*) from urls")
tw fetch_tweet https://twitter.com/CovfefeAnon/status/1428904664645394433
urls_count_after=$(sqlite3 twitter.db "select count(*) from urls")
test "$urls_count_after" = "$(($urls_count + 1))"
test "$(sqlite3 twitter.db "select title from urls where tweet_id = 1428904664645394433")" = "Justice Department investigating Elon Musk's SpaceX following complaint of hiring discrimination"
# Quoted so that the `?` in the URL is never treated as a glob pattern and an
# empty query result yields a clean comparison failure instead of a `test` syntax error.
test "$(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433")" = "https://pbs.twimg.com/card_img/1436430370946392064/WX1Rv2AJ?format=jpg&name=800x320_1"

# Try to double-fetch it; shouldn't duplicate the URL
tw fetch_tweet https://twitter.com/CovfefeAnon/status/1428904664645394433
urls_count_after_2x=$(sqlite3 twitter.db "select count(*) from urls")
test "$urls_count_after_2x" = "$urls_count_after"
# TODO: Maybe this file should be broken up into multiple test scripts
echo -e "\033[32mAll tests passed. Finished successfully.\033[0m"

View File

@ -42,6 +42,22 @@ func (p Profile) SaveVideo(vid scraper.Video) error {
return err
}
/**
 * SaveUrl inserts a Url into the `urls` table.
 *
 * If a row with the same (tweet_id, text) pair already exists, only its
 * `is_content_downloaded` flag is overwritten with the new value; every other
 * stored column is left untouched.
 *
 * The conflict target is named explicitly so the statement stays valid on
 * SQLite versions that require one for `do update`, and `excluded.` is used
 * so the new value does not have to be bound a second time.
 *
 * Returns whatever error the database driver reports, or nil on success.
 */
func (p Profile) SaveUrl(url scraper.Url) error {
	_, err := p.DB.Exec(`
		insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded)
		values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
		    on conflict (tweet_id, text) do update
		   set is_content_downloaded=excluded.is_content_downloaded
		`,
		url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.IsContentDownloaded,
	)
	return err
}
/**
* Get the list of images for a tweet
*/
@ -93,3 +109,28 @@ func (p Profile) GetVideosForTweet(t scraper.Tweet) (vids []scraper.Video, err e
}
return
}
/**
 * GetUrlsForTweet loads every Url stored for the given Tweet, ordered by rowid
 * (i.e. insertion order).
 *
 * Each returned Url has its TweetID set to `t.ID`; the remaining fields come
 * straight from the `urls` table.  A tweet with no stored Urls yields a nil
 * slice and a nil error.
 *
 * On any failure (prepare, query, scan, or an error surfaced while iterating)
 * the error is returned together with a nil slice.
 */
func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error) {
	stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded from urls where tweet_id=? order by rowid")
	if err != nil {
		return nil, err
	}
	defer stmt.Close()

	rows, err := stmt.Query(t.ID)
	if err != nil {
		return nil, err
	}
	// Release the result set even when we bail out early on a scan error.
	defer rows.Close()

	for rows.Next() {
		// Fresh value per row so nothing can leak over from the previous one.
		var url scraper.Url
		err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.IsContentDownloaded)
		if err != nil {
			return nil, err
		}
		url.TweetID = t.ID
		urls = append(urls, url)
	}
	// rows.Next() returning false can mean "done" or "failed"; distinguish the two.
	if err = rows.Err(); err != nil {
		return nil, err
	}
	return urls, nil
}

View File

@ -165,3 +165,81 @@ func TestModifyVideo(t *testing.T) {
t.Error(diff)
}
}
/**
 * Round-trip check: build a brand-new Url attached to the stable tweet, save
 * it, read the tweet's Urls back, and verify the stored copy is identical.
 */
func TestSaveAndLoadUrl(t *testing.T) {
	profile := create_or_load_profile("test_profiles/TestMediaQueries")
	tweet := create_stable_tweet()

	// Random ID so repeated runs against the same profile don't collide
	rand.Seed(time.Now().UnixNano())
	want := create_url_from_id(rand.Int())
	want.TweetID = tweet.ID

	// Persist it
	if err := profile.SaveUrl(want); err != nil {
		t.Fatalf("Failed to save the url: %s", err.Error())
	}

	// Read everything back for this tweet
	loaded, err := profile.GetUrlsForTweet(tweet)
	if err != nil {
		t.Fatalf("Could not load urls: %s", err.Error())
	}

	// The tweet may carry other Urls too; pick out ours by its text
	var got scraper.Url
	for _, candidate := range loaded {
		if candidate.Text == want.Text {
			got = candidate
		}
	}
	if got.Text != want.Text {
		t.Fatalf("Could not find url for some reason: %s, %s; %+v", got.Text, want.Text, loaded)
	}

	if diff := deep.Equal(want, got); diff != nil {
		t.Error(diff)
	}
}
/**
 * Change a Url, save the changes, reload it, and check if it comes back the same.
 *
 * Works on the first Url of the stable tweet (expected to have Text "-1text"),
 * flips its IsContentDownloaded flag, and verifies the reloaded row matches.
 */
func TestModifyUrl(t *testing.T) {
	profile_path := "test_profiles/TestMediaQueries"
	profile := create_or_load_profile(profile_path)
	tweet := create_stable_tweet()

	// Fail cleanly rather than panic on an index-out-of-range
	if len(tweet.Urls) == 0 {
		t.Fatalf("Stable tweet has no urls to modify")
	}
	url := tweet.Urls[0]

	if url.Text != "-1text" {
		t.Fatalf("Got the wrong url back: wanted %s, got %s!", "-1text", url.Text)
	}

	url.IsContentDownloaded = true

	// Save the changes; nothing below is meaningful if this fails
	err := profile.SaveUrl(url)
	if err != nil {
		t.Fatalf("Failed to save the url: %s", err.Error())
	}

	// Reload it
	urls, err := profile.GetUrlsForTweet(tweet)
	if err != nil {
		t.Fatalf("Could not load urls: %s", err.Error())
	}
	if len(urls) == 0 {
		t.Fatalf("Expected at least one url for the stable tweet, but got none")
	}
	new_url := urls[0]
	if new_url.Text != "-1text" {
		t.Fatalf("Got the wrong url back: wanted %s, got %s!", "-1text", new_url.Text)
	}

	if diff := deep.Equal(url, new_url); diff != nil {
		t.Error(diff)
	}
}

View File

@ -54,10 +54,21 @@ create table retweets(rowid integer primary key,
-- One row per URL embedded in a tweet, plus the link-preview ("card") details
-- when the tweet has one.
create table urls (rowid integer primary key,
    tweet_id integer not null,
    domain text,
    text text not null,               -- the URL itself
    title text,
    description text,
    creator_id integer,
    site_id integer,
    thumbnail_remote_url text,
    thumbnail_local_path text,
    has_card boolean,
    is_content_downloaded boolean default 0,

    -- A given URL is stored at most once per tweet; inserts use this
    -- constraint to detect conflicts.
    unique (tweet_id, text),
    foreign key(tweet_id) references tweets(id)
    -- foreign key(creator_id) references users(id)
    -- foreign key(site_id) references users(id)
);
create table images (rowid integer primary key,

View File

@ -1,7 +1,6 @@
package persistence
import (
"fmt"
"time"
"strings"
"database/sql"
@ -34,7 +33,7 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
return err
}
for _, url := range t.Urls {
_, err := db.Exec("insert into urls (tweet_id, text) values (?, ?) on conflict do nothing", t.ID, url)
err := p.SaveUrl(url)
if err != nil {
return err
}
@ -80,29 +79,6 @@ func (p Profile) IsTweetInDatabase(id scraper.TweetID) bool {
return true
}
func (p Profile) attach_urls(t *scraper.Tweet) error {
println("Attaching urls")
stmt, err := p.DB.Prepare("select text from urls where tweet_id = ?")
if err != nil {
return err
}
defer stmt.Close()
rows, err := stmt.Query(t.ID)
if err != nil {
return err
}
var url string
for rows.Next() {
err = rows.Scan(&url)
if err != nil {
return err
}
t.Urls = append(t.Urls, url)
fmt.Printf("%v\n", t.Urls)
}
return nil
}
func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
db := p.DB
@ -146,7 +122,9 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
}
t.Videos = vids
err = p.attach_urls(&t)
urls, err := p.GetUrlsForTweet(t)
t.Urls = urls
return t, err
}

View File

@ -29,12 +29,6 @@ func TestSaveAndLoadTweet(t *testing.T) {
t.Fatalf("Failed to load the tweet: %s", err.Error())
}
if diff := deep.Equal(tweet.Images, new_tweet.Images); diff != nil {
t.Error(diff)
}
if diff := deep.Equal(tweet.Videos, new_tweet.Videos); diff != nil {
t.Error(diff)
}
if diff := deep.Equal(tweet, new_tweet); diff != nil {
t.Error(diff)
}

View File

@ -92,6 +92,26 @@ func create_video_from_id(id int) scraper.Video {
}
}
/**
 * Build a predictable Url whose string fields are all derived from the given ID.
 *
 * The TweetID is a placeholder (-1); callers are expected to overwrite it when
 * they attach the Url to a particular tweet.
 */
func create_url_from_id(id int) scraper.Url {
	prefix := fmt.Sprint(id)
	user_id := scraper.UserID(id)

	var url scraper.Url
	url.TweetID = -1
	url.Domain = prefix + "domain"
	url.Text = prefix + "text"
	url.Title = prefix + "title"
	url.Description = prefix + "description"
	url.ThumbnailRemoteUrl = prefix + "remote url"
	url.ThumbnailLocalPath = prefix + "local path"
	url.CreatorID = user_id
	url.SiteID = user_id
	url.HasCard = true
	url.IsContentDownloaded = false
	return url
}
/**
* Create a stable tweet with a fixed ID and content
*/
@ -109,7 +129,9 @@ func create_stable_tweet() scraper.Tweet {
Videos: []scraper.Video{
create_video_from_id(-1),
},
Urls: []string{},
Urls: []scraper.Url{
create_url_from_id(-1),
},
Images: []scraper.Image{
create_image_from_id(-1),
},
@ -173,6 +195,11 @@ func create_dummy_tweet() scraper.Tweet {
vid := create_video_from_id(rand.Int())
vid.TweetID = tweet_id
url1 := create_url_from_id(rand.Int())
url1.TweetID = tweet_id
url2 := create_url_from_id(rand.Int())
url2.TweetID = tweet_id
return scraper.Tweet{
ID: tweet_id,
UserID: -1,
@ -183,7 +210,7 @@ func create_dummy_tweet() scraper.Tweet {
NumReplies: 3,
NumQuoteTweets: 4,
Videos: []scraper.Video{vid},
Urls: []string{"url1", "url2"},
Urls: []scraper.Url{url1, url2},
Images: []scraper.Image{img1, img2},
Mentions: []scraper.UserHandle{"mention1", "mention2"},
Hashtags: []string{"hash1", "hash2"},

View File

@ -0,0 +1 @@
{"created_at":"Fri Sep 17 00:03:26 +0000 2021","id_str":"1438654793384353793","full_text":"NEW: columnist Jennifer Rubin was one of the Obama administrations most reactionary critics\n\nNow shes the Biden admins favorite columnist\nW/ @NickNiedz\n\nWe reached out to Rubin her columns and divisions at Wapo over them\n\nHer response—&gt;\nhttps://t.co/ZigZyLctwt https://t.co/KZZAK1tXhq","display_text_range":[0,266],"entities":{"user_mentions":[{"screen_name":"NickNiedz","name":"Nick Niedzwiadek","id_str":"548501303","indices":[144,154]}],"urls":[{"url":"https://t.co/ZigZyLctwt","expanded_url":"https://www.politico.com/newsletters/west-wing-playbook/2021/09/16/the-jennifer-rubin-wh-symbiosis-494364","display_url":"politico.com/newsletters/we…","indices":[243,266]}],"media":[{"id_str":"1438654789596942336","indices":[267,290],"media_url":"http://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","media_url_https":"https://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","url":"https://t.co/KZZAK1tXhq","display_url":"pic.twitter.com/KZZAK1tXhq","expanded_url":"https://twitter.com/AlexThomp/status/1438654793384353793/photo/1","type":"photo","original_info":{"width":1170,"height":1809,"focus_rects":[{"x":0,"y":0,"h":655,"w":1170},{"x":0,"y":0,"h":1170,"w":1170},{"x":0,"y":0,"h":1334,"w":1170},{"x":45,"y":0,"h":1809,"w":905},{"x":0,"y":0,"h":1809,"w":1170}]},"sizes":{"small":{"w":440,"h":680,"resize":"fit"},"medium":{"w":776,"h":1200,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1170,"h":1809,"resize":"fit"}}}]},"extended_entities":{"media":[{"id_str":"1438654789596942336","indices":[267,290],"media_url":"http://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","media_url_https":"https://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","url":"https://t.co/KZZAK1tXhq","display_url":"pic.twitter.com/KZZAK1tXhq","expanded_url":"https://twitter.com/AlexThomp/status/1438654793384353793/photo/1","type":"photo","original_info":{"width":1170,"height":1809,"focus_rects":[{"x":0,"y":0,"h":655,
"w":1170},{"x":0,"y":0,"h":1170,"w":1170},{"x":0,"y":0,"h":1334,"w":1170},{"x":45,"y":0,"h":1809,"w":905},{"x":0,"y":0,"h":1809,"w":1170}]},"sizes":{"small":{"w":440,"h":680,"resize":"fit"},"medium":{"w":776,"h":1200,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1170,"h":1809,"resize":"fit"}},"media_key":"3_1438654789596942336","ext_alt_text":null,"ext_media_availability":{"status":"available"},"ext_media_color":{"palette":[{"rgb":{"red":252,"green":252,"blue":252},"percentage":99.67},{"rgb":{"red":145,"green":145,"blue":145},"percentage":0.33}]},"ext":{"mediaStats":{"r":"Missing","ttl":-1}}}]},"source":"<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>","user_id_str":"370982639","retweet_count":407,"favorite_count":1802,"reply_count":2642,"quote_count":884,"conversation_id_str":"1438654793384353793","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1438654793384353793"}}

File diff suppressed because one or more lines are too long

View File

@ -23,7 +23,7 @@ type Tweet struct {
NumQuoteTweets int
InReplyTo TweetID
Urls []string
Urls []Url
Images []Image
Videos []Video
Mentions []UserHandle
@ -63,7 +63,7 @@ Replies: %d RT: %d QT: %d Likes: %d
if len(t.Urls) > 0 {
ret += "urls: [\n"
for _, url := range(t.Urls) {
ret += " " + url + "\n"
ret += " " + url.Text + "\n"
}
ret += "]"
}
@ -89,8 +89,18 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
ret.NumQuoteTweets = apiTweet.QuoteCount
ret.InReplyTo = TweetID(apiTweet.InReplyToStatusID)
for _, url := range apiTweet.Entities.URLs {
ret.Urls = append(ret.Urls, url.ExpandedURL)
for i, url := range apiTweet.Entities.URLs {
if i != 0 {
panic(fmt.Sprintf("Tweet with multiple embedded URLs: %d", apiTweet.ID))
}
var url_object Url
if apiTweet.Card.BindingValues.Domain.Value != "" {
// Using the "Domain" field to detect if there is a card
url_object = ParseAPIUrlCard(apiTweet.Card)
}
url_object.Text = url.ExpandedURL
url_object.TweetID = ret.ID
ret.Urls = append(ret.Urls, url_object)
}
for _, media := range apiTweet.Entities.Media {
if media.Type != "photo" { // TODO: remove this eventually

View File

@ -52,6 +52,10 @@ func TestParseSingleTweet(t *testing.T) {
t.Errorf("Expected %v, got %v", []string{"michaelmalice"}, tweet.Mentions)
}
if len(tweet.Urls) != 0 {
t.Errorf("Expected %d urls, but got %d", 0, len(tweet.Urls))
}
if tweet.PostedAt.Unix() != 1621639105 {
t.Errorf("Expected %d, got %d", 1621639105, tweet.PostedAt.Unix())
}
@ -162,6 +166,66 @@ func TestParseTweetWithVideo(t *testing.T) {
}
}
/**
 * Parse a tweet whose embedded link has a preview card, and check that the
 * resulting Url carries the expanded link text, the card flag and the card's domain.
 */
func TestParseTweetWithUrl(t *testing.T) {
	data, err := ioutil.ReadFile("test_responses/tweet_with_url_card.json")
	if err != nil {
		panic(err)
	}
	var apitweet scraper.APITweet
	err = json.Unmarshal(data, &apitweet)
	if err != nil {
		// Nothing below is meaningful without a decoded tweet
		t.Fatal(err)
	}
	tweet, err := scraper.ParseSingleTweet(apitweet)
	if err != nil {
		t.Fatal(err)
	}

	// Fatal, not Error: the assertions below index tweet.Urls[0]
	if len(tweet.Urls) != 1 {
		t.Fatalf("Expected %d urls, but got %d", 1, len(tweet.Urls))
	}

	expected_url_text := "https://reason.com/2021/08/30/la-teachers-union-cecily-myart-cruz-learning-loss/"
	if tweet.Urls[0].Text != expected_url_text {
		t.Errorf("Expected Url text to be %q, but got %q", expected_url_text, tweet.Urls[0].Text)
	}
	if !tweet.Urls[0].HasCard {
		t.Errorf("Expected it to have a card, but it doesn't")
	}
	expected_url_domain := "reason.com"
	if tweet.Urls[0].Domain != expected_url_domain {
		t.Errorf("Expected Url domain to be %q, but got %q", expected_url_domain, tweet.Urls[0].Domain)
	}
}
/**
 * Parse a tweet whose embedded link has no preview card, and check that the
 * Url is still extracted (with the expanded link as its text) but is not
 * marked as having a card.
 */
func TestParseTweetWithUrlButNoCard(t *testing.T) {
	data, err := ioutil.ReadFile("test_responses/tweet_with_url_but_no_card.json")
	if err != nil {
		panic(err)
	}
	var apitweet scraper.APITweet
	err = json.Unmarshal(data, &apitweet)
	if err != nil {
		// Nothing below is meaningful without a decoded tweet
		t.Fatal(err)
	}
	tweet, err := scraper.ParseSingleTweet(apitweet)
	if err != nil {
		t.Fatal(err)
	}

	// Fatal, not Error: the assertions below index tweet.Urls[0]
	if len(tweet.Urls) != 1 {
		t.Fatalf("Expected %d urls, but got %d", 1, len(tweet.Urls))
	}

	expected_url_text := "https://www.politico.com/newsletters/west-wing-playbook/2021/09/16/the-jennifer-rubin-wh-symbiosis-494364"
	if tweet.Urls[0].Text != expected_url_text {
		t.Errorf("Expected Url text to be %q, but got %q", expected_url_text, tweet.Urls[0].Text)
	}
	if tweet.Urls[0].HasCard {
		t.Errorf("Expected url not to have a card, but it thinks it has one")
	}
}
func TestParseTweetResponse(t *testing.T) {
data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")
if err != nil {

View File

@ -7,6 +7,8 @@ import (
)
type Url struct {
TweetID TweetID
Domain string
Text string
Title string