Add persistence for new Url type
parent 79f098450e
commit 05c3f2289b

cmd/tests.sh (+16)
@@ -114,4 +114,20 @@ tw fetch_user HbdNrx
 test $(sqlite3 twitter.db "select is_private from users where handle = 'HbdNrx'") = "1"
+
+
+# Test tweets with URLs
+urls_count=$(sqlite3 twitter.db "select count(*) from urls")
+tw fetch_tweet https://twitter.com/CovfefeAnon/status/1428904664645394433
+urls_count_after=$(sqlite3 twitter.db "select count(*) from urls")
+test $urls_count_after = $(($urls_count + 1))
+test "$(sqlite3 twitter.db "select title from urls where tweet_id = 1428904664645394433")" = "Justice Department investigating Elon Musk's SpaceX following complaint of hiring discrimination"
+test $(sqlite3 twitter.db "select thumbnail_remote_url from urls where tweet_id = 1428904664645394433") = "https://pbs.twimg.com/card_img/1436430370946392064/WX1Rv2AJ?format=jpg&name=800x320_1"
+
+# Try to double-fetch it; shouldn't duplicate the URL
+tw fetch_tweet https://twitter.com/CovfefeAnon/status/1428904664645394433
+urls_count_after_2x=$(sqlite3 twitter.db "select count(*) from urls")
+test $urls_count_after_2x = $urls_count_after
+
+
 # TODO: Maybe this file should be broken up into multiple test scripts
 
 echo -e "\033[32mAll tests passed. Finished successfully.\033[0m"
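For illustration only (not part of the commit): the double-fetch assertion above can also be expressed directly against the test database from Go. This sketch assumes the github.com/mattn/go-sqlite3 driver; the project's actual driver choice is not shown in this diff.

    package main

    import (
        "database/sql"
        "fmt"

        _ "github.com/mattn/go-sqlite3" // assumed driver; registers "sqlite3"
    )

    func main() {
        db, err := sql.Open("sqlite3", "twitter.db")
        if err != nil {
            panic(err)
        }
        defer db.Close()

        // Mirrors the shell test: the URL row for this tweet should exist exactly once,
        // even after `tw fetch_tweet` has been run on it twice.
        var n int
        if err := db.QueryRow("select count(*) from urls where tweet_id = ?", int64(1428904664645394433)).Scan(&n); err != nil {
            panic(err)
        }
        fmt.Println("rows for tweet:", n) // expected: 1
    }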
@@ -42,6 +42,22 @@ func (p Profile) SaveVideo(vid scraper.Video) error {
     return err
 }
 
+/**
+ * Save an Url
+ */
+func (p Profile) SaveUrl(url scraper.Url) error {
+    _, err := p.DB.Exec(`
+        insert into urls (tweet_id, domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded)
+        values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        on conflict do update
+        set is_content_downloaded=?
+        `,
+        url.TweetID, url.Domain, url.Text, url.Title, url.Description, url.CreatorID, url.SiteID, url.ThumbnailRemoteUrl, url.ThumbnailLocalPath, url.HasCard, url.IsContentDownloaded,
+        url.IsContentDownloaded,
+    )
+    return err
+}
+
 /**
  * Get the list of images for a tweet
  */
@@ -93,3 +109,28 @@ func (p Profile) GetVideosForTweet(t scraper.Tweet) (vids []scraper.Video, err error) {
     }
     return
 }
+
+/**
+ * Get the list of Urls for a Tweet
+ */
+func (p Profile) GetUrlsForTweet(t scraper.Tweet) (urls []scraper.Url, err error) {
+    stmt, err := p.DB.Prepare("select domain, text, title, description, creator_id, site_id, thumbnail_remote_url, thumbnail_local_path, has_card, is_content_downloaded from urls where tweet_id=? order by rowid")
+    if err != nil {
+        return
+    }
+    defer stmt.Close()
+    rows, err := stmt.Query(t.ID)
+    if err != nil {
+        return
+    }
+    var url scraper.Url
+    for rows.Next() {
+        err = rows.Scan(&url.Domain, &url.Text, &url.Title, &url.Description, &url.CreatorID, &url.SiteID, &url.ThumbnailRemoteUrl, &url.ThumbnailLocalPath, &url.HasCard, &url.IsContentDownloaded)
+        if err != nil {
+            return
+        }
+        url.TweetID = t.ID
+        urls = append(urls, url)
+    }
+    return
+}
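A usage sketch (not in the commit) of the pair above: because a second SaveUrl call collides on the urls table's unique (tweet_id, text) key, it takes the on-conflict path and only refreshes is_content_downloaded. demoUrlUpsert is a hypothetical name; it would live in package persistence, assumes fmt is imported, and assumes tweet has already been saved.

    // Sketch only: call SaveUrl twice, then reload through GetUrlsForTweet.
    func demoUrlUpsert(profile Profile, tweet scraper.Tweet) error {
        url := scraper.Url{
            TweetID: tweet.ID,
            Text:    "https://example.com/article", // (tweet_id, text) is the unique key
        }
        if err := profile.SaveUrl(url); err != nil { // first call: plain insert
            return err
        }
        url.IsContentDownloaded = true
        if err := profile.SaveUrl(url); err != nil { // second call: conflict, updates the flag in place
            return err
        }
        urls, err := profile.GetUrlsForTweet(tweet)
        if err != nil {
            return err
        }
        // Still exactly one row for this Text, now with IsContentDownloaded == true.
        fmt.Printf("%d url(s), downloaded=%v\n", len(urls), urls[0].IsContentDownloaded)
        return nil
    }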
@@ -165,3 +165,81 @@ func TestModifyVideo(t *testing.T) {
         t.Error(diff)
     }
 }
+
+
+/**
+ * Create an Url, save it, reload it, and make sure it comes back the same
+ */
+func TestSaveAndLoadUrl(t *testing.T) {
+    profile_path := "test_profiles/TestMediaQueries"
+    profile := create_or_load_profile(profile_path)
+
+    tweet := create_stable_tweet()
+
+    // Create a fresh Url to test on
+    rand.Seed(time.Now().UnixNano())
+    url := create_url_from_id(rand.Int())
+    url.TweetID = tweet.ID
+
+    // Save the Url
+    err := profile.SaveUrl(url)
+    if err != nil {
+        t.Fatalf("Failed to save the url: %s", err.Error())
+    }
+
+    // Reload the Url
+    urls, err := profile.GetUrlsForTweet(tweet)
+    if err != nil {
+        t.Fatalf("Could not load urls: %s", err.Error())
+    }
+
+    var new_url scraper.Url
+    for index := range urls {
+        if urls[index].Text == url.Text {
+            new_url = urls[index]
+        }
+    }
+    if new_url.Text != url.Text {
+        t.Fatalf("Could not find url for some reason: %s, %s; %+v", new_url.Text, url.Text, urls)
+    }
+    if diff := deep.Equal(url, new_url); diff != nil {
+        t.Error(diff)
+    }
+}
+
+/**
+ * Change an Url, save the changes, reload it, and check if it comes back the same
+ */
+func TestModifyUrl(t *testing.T) {
+    profile_path := "test_profiles/TestMediaQueries"
+    profile := create_or_load_profile(profile_path)
+
+    tweet := create_stable_tweet()
+    url := tweet.Urls[0]
+
+    if url.Text != "-1text" {
+        t.Fatalf("Got the wrong url back: wanted %s, got %s!", "-1text", url.Text)
+    }
+
+    url.IsContentDownloaded = true
+
+    // Save the changes
+    err := profile.SaveUrl(url)
+    if err != nil {
+        t.Error(err)
+    }
+
+    // Reload it
+    urls, err := profile.GetUrlsForTweet(tweet)
+    if err != nil {
+        t.Fatalf("Could not load urls: %s", err.Error())
+    }
+    new_url := urls[0]
+    if new_url.Text != "-1text" {
+        t.Fatalf("Got the wrong url back: wanted %s, got %s!", "-1text", new_url.Text)
+    }
+
+    if diff := deep.Equal(url, new_url); diff != nil {
+        t.Error(diff)
+    }
+}
@@ -54,10 +54,21 @@ create table retweets(rowid integer primary key,
 
 create table urls (rowid integer primary key,
     tweet_id integer not null,
+    domain text,
     text text not null,
+    title text,
+    description text,
+    creator_id integer,
+    site_id integer,
+    thumbnail_remote_url text,
+    thumbnail_local_path text,
+    has_card boolean,
+    is_content_downloaded boolean default 0,
 
     unique (tweet_id, text),
    foreign key(tweet_id) references tweets(id)
+    -- foreign key(creator_id) references users(id)
+    -- foreign key(site_id) references users(id)
 );
 
 create table images (rowid integer primary key,
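The unique (tweet_id, text) pair is what SaveUrl's target-less "on conflict do update" resolves against (SQLite 3.35+ accepts the omitted conflict target with DO UPDATE). A self-contained sketch of that behavior against a cut-down copy of the table, again assuming the github.com/mattn/go-sqlite3 driver:

    package main

    import (
        "database/sql"
        "fmt"

        _ "github.com/mattn/go-sqlite3" // assumed driver
    )

    func main() {
        db, err := sql.Open("sqlite3", ":memory:")
        if err != nil {
            panic(err)
        }
        defer db.Close()

        // Only the columns that matter for the upsert.
        if _, err := db.Exec(`create table urls (rowid integer primary key,
            tweet_id integer not null,
            text text not null,
            is_content_downloaded boolean default 0,
            unique (tweet_id, text))`); err != nil {
            panic(err)
        }

        // Insert the same (tweet_id, text) twice; the second round takes the update path.
        for _, downloaded := range []bool{false, true} {
            if _, err := db.Exec(`insert into urls (tweet_id, text, is_content_downloaded)
                values (?, ?, ?) on conflict do update set is_content_downloaded=?`,
                1, "https://example.com", downloaded, downloaded); err != nil {
                panic(err)
            }
        }

        var count int
        var downloaded bool
        if err := db.QueryRow("select count(*), max(is_content_downloaded) from urls").Scan(&count, &downloaded); err != nil {
            panic(err)
        }
        fmt.Println(count, downloaded) // 1 true: one row, flag updated in place
    }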
@@ -1,7 +1,6 @@
 package persistence
 
 import (
-    "fmt"
     "time"
    "strings"
    "database/sql"
@@ -34,7 +33,7 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
         return err
     }
     for _, url := range t.Urls {
-        _, err := db.Exec("insert into urls (tweet_id, text) values (?, ?) on conflict do nothing", t.ID, url)
+        err := p.SaveUrl(url)
         if err != nil {
             return err
         }
@@ -80,29 +79,6 @@ func (p Profile) IsTweetInDatabase(id scraper.TweetID) bool {
     return true
 }
 
-func (p Profile) attach_urls(t *scraper.Tweet) error {
-    println("Attaching urls")
-    stmt, err := p.DB.Prepare("select text from urls where tweet_id = ?")
-    if err != nil {
-        return err
-    }
-    defer stmt.Close()
-    rows, err := stmt.Query(t.ID)
-    if err != nil {
-        return err
-    }
-    var url string
-    for rows.Next() {
-        err = rows.Scan(&url)
-        if err != nil {
-            return err
-        }
-        t.Urls = append(t.Urls, url)
-        fmt.Printf("%v\n", t.Urls)
-    }
-    return nil
-}
-
 func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
     db := p.DB
 
@@ -146,7 +122,9 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
     }
     t.Videos = vids
 
-    err = p.attach_urls(&t)
+    urls, err := p.GetUrlsForTweet(t)
+    t.Urls = urls
+
     return t, err
 }
 
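With attach_urls gone, GetTweetById hydrates t.Urls through the same GetUrlsForTweet accessor used everywhere else, and the stray println/fmt.Printf debugging goes with it. A hypothetical caller, for illustration only; it assumes fmt and log are imported and that profile is an open Profile:

    // Sketch only: load a tweet and walk its hydrated Urls.
    tweet, err := profile.GetTweetById(scraper.TweetID(1428904664645394433))
    if err != nil {
        log.Fatal(err)
    }
    for _, url := range tweet.Urls {
        fmt.Println(url.Domain, url.Text, url.HasCard)
    }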
@@ -29,12 +29,6 @@ func TestSaveAndLoadTweet(t *testing.T) {
         t.Fatalf("Failed to load the tweet: %s", err.Error())
     }
 
-    if diff := deep.Equal(tweet.Images, new_tweet.Images); diff != nil {
-        t.Error(diff)
-    }
-    if diff := deep.Equal(tweet.Videos, new_tweet.Videos); diff != nil {
-        t.Error(diff)
-    }
     if diff := deep.Equal(tweet, new_tweet); diff != nil {
         t.Error(diff)
     }
@@ -92,6 +92,26 @@ func create_video_from_id(id int) scraper.Video {
     }
 }
 
+/**
+ * Create a semi-stable Url based on the given ID
+ */
+func create_url_from_id(id int) scraper.Url {
+    s := fmt.Sprint(id)
+    return scraper.Url{
+        TweetID:             -1,
+        Domain:              s + "domain",
+        Text:                s + "text",
+        Title:               s + "title",
+        Description:         s + "description",
+        ThumbnailRemoteUrl:  s + "remote url",
+        ThumbnailLocalPath:  s + "local path",
+        CreatorID:           scraper.UserID(id),
+        SiteID:              scraper.UserID(id),
+        HasCard:             true,
+        IsContentDownloaded: false,
+    }
+}
+
 /**
  * Create a stable tweet with a fixed ID and content
  */
@@ -109,7 +129,9 @@ func create_stable_tweet() scraper.Tweet {
         Videos: []scraper.Video{
             create_video_from_id(-1),
         },
-        Urls: []string{},
+        Urls: []scraper.Url{
+            create_url_from_id(-1),
+        },
         Images: []scraper.Image{
             create_image_from_id(-1),
         },
@@ -173,6 +195,11 @@ func create_dummy_tweet() scraper.Tweet {
     vid := create_video_from_id(rand.Int())
     vid.TweetID = tweet_id
 
+    url1 := create_url_from_id(rand.Int())
+    url1.TweetID = tweet_id
+    url2 := create_url_from_id(rand.Int())
+    url2.TweetID = tweet_id
+
     return scraper.Tweet{
         ID:     tweet_id,
         UserID: -1,
@@ -183,7 +210,7 @@ func create_dummy_tweet() scraper.Tweet {
         NumReplies:     3,
         NumQuoteTweets: 4,
         Videos:         []scraper.Video{vid},
-        Urls:           []string{"url1", "url2"},
+        Urls:           []scraper.Url{url1, url2},
         Images:         []scraper.Image{img1, img2},
         Mentions:       []scraper.UserHandle{"mention1", "mention2"},
         Hashtags:       []string{"hash1", "hash2"},
scraper/test_responses/tweet_with_url_but_no_card.json (new file, +1)
@@ -0,0 +1 @@
{"created_at":"Fri Sep 17 00:03:26 +0000 2021","id_str":"1438654793384353793","full_text":"NEW: columnist Jennifer Rubin was one of the Obama administration’s most reactionary critics\n\nNow she’s the Biden admin’s favorite columnist\nW/ @NickNiedz\n\nWe reached out to Rubin her columns and divisions at Wapo over them\n\nHer response—>\nhttps://t.co/ZigZyLctwt https://t.co/KZZAK1tXhq","display_text_range":[0,266],"entities":{"user_mentions":[{"screen_name":"NickNiedz","name":"Nick Niedzwiadek","id_str":"548501303","indices":[144,154]}],"urls":[{"url":"https://t.co/ZigZyLctwt","expanded_url":"https://www.politico.com/newsletters/west-wing-playbook/2021/09/16/the-jennifer-rubin-wh-symbiosis-494364","display_url":"politico.com/newsletters/we…","indices":[243,266]}],"media":[{"id_str":"1438654789596942336","indices":[267,290],"media_url":"http://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","media_url_https":"https://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","url":"https://t.co/KZZAK1tXhq","display_url":"pic.twitter.com/KZZAK1tXhq","expanded_url":"https://twitter.com/AlexThomp/status/1438654793384353793/photo/1","type":"photo","original_info":{"width":1170,"height":1809,"focus_rects":[{"x":0,"y":0,"h":655,"w":1170},{"x":0,"y":0,"h":1170,"w":1170},{"x":0,"y":0,"h":1334,"w":1170},{"x":45,"y":0,"h":1809,"w":905},{"x":0,"y":0,"h":1809,"w":1170}]},"sizes":{"small":{"w":440,"h":680,"resize":"fit"},"medium":{"w":776,"h":1200,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1170,"h":1809,"resize":"fit"}}}]},"extended_entities":{"media":[{"id_str":"1438654789596942336","indices":[267,290],"media_url":"http://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","media_url_https":"https://pbs.twimg.com/media/E_cg6KhXEAAyPjY.jpg","url":"https://t.co/KZZAK1tXhq","display_url":"pic.twitter.com/KZZAK1tXhq","expanded_url":"https://twitter.com/AlexThomp/status/1438654793384353793/photo/1","type":"photo","original_info":{"width":1170,"height":1809,"focus_rects":[{"x":0,"y":0,"h":655,"w":1170},{"x":0,"y":0,"h":1170,"w":1170},{"x":0,"y":0,"h":1334,"w":1170},{"x":45,"y":0,"h":1809,"w":905},{"x":0,"y":0,"h":1809,"w":1170}]},"sizes":{"small":{"w":440,"h":680,"resize":"fit"},"medium":{"w":776,"h":1200,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1170,"h":1809,"resize":"fit"}},"media_key":"3_1438654789596942336","ext_alt_text":null,"ext_media_availability":{"status":"available"},"ext_media_color":{"palette":[{"rgb":{"red":252,"green":252,"blue":252},"percentage":99.67},{"rgb":{"red":145,"green":145,"blue":145},"percentage":0.33}]},"ext":{"mediaStats":{"r":"Missing","ttl":-1}}}]},"source":"<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>","user_id_str":"370982639","retweet_count":407,"favorite_count":1802,"reply_count":2642,"quote_count":884,"conversation_id_str":"1438654793384353793","possibly_sensitive_editable":true,"lang":"en","self_thread":{"id_str":"1438654793384353793"}}

scraper/test_responses/tweet_with_url_card.json (new file, +1)
File diff suppressed because one or more lines are too long
@@ -23,7 +23,7 @@ type Tweet struct {
     NumQuoteTweets int
     InReplyTo      TweetID
 
-    Urls     []string
+    Urls     []Url
     Images   []Image
     Videos   []Video
     Mentions []UserHandle
@@ -63,7 +63,7 @@ Replies: %d RT: %d QT: %d Likes: %d
     if len(t.Urls) > 0 {
         ret += "urls: [\n"
         for _, url := range(t.Urls) {
-            ret += " " + url + "\n"
+            ret += " " + url.Text + "\n"
         }
         ret += "]"
     }
@@ -89,8 +89,18 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
     ret.NumQuoteTweets = apiTweet.QuoteCount
     ret.InReplyTo = TweetID(apiTweet.InReplyToStatusID)
 
-    for _, url := range apiTweet.Entities.URLs {
-        ret.Urls = append(ret.Urls, url.ExpandedURL)
+    for i, url := range apiTweet.Entities.URLs {
+        if i != 0 {
+            panic(fmt.Sprintf("Tweet with multiple embedded URLs: %d", apiTweet.ID))
+        }
+        var url_object Url
+        if apiTweet.Card.BindingValues.Domain.Value != "" {
+            // Using the "Domain" field to detect if there is a card
+            url_object = ParseAPIUrlCard(apiTweet.Card)
+        }
+        url_object.Text = url.ExpandedURL
+        url_object.TweetID = ret.ID
+        ret.Urls = append(ret.Urls, url_object)
     }
     for _, media := range apiTweet.Entities.Media {
         if media.Type != "photo" {  // TODO: remove this eventually
@@ -52,6 +52,10 @@ func TestParseSingleTweet(t *testing.T) {
         t.Errorf("Expected %v, got %v", []string{"michaelmalice"}, tweet.Mentions)
     }
 
+    if len(tweet.Urls) != 0 {
+        t.Errorf("Expected %d urls, but got %d", 0, len(tweet.Urls))
+    }
+
     if tweet.PostedAt.Unix() != 1621639105 {
         t.Errorf("Expected %d, got %d", 1621639105, tweet.PostedAt.Unix())
     }
@@ -162,6 +166,66 @@ func TestParseTweetWithVideo(t *testing.T) {
     }
 }
 
+func TestParseTweetWithUrl(t *testing.T) {
+    data, err := ioutil.ReadFile("test_responses/tweet_with_url_card.json")
+    if err != nil {
+        panic(err)
+    }
+    var apitweet scraper.APITweet
+    err = json.Unmarshal(data, &apitweet)
+    if err != nil {
+        t.Errorf(err.Error())
+    }
+    tweet, err := scraper.ParseSingleTweet(apitweet)
+    if err != nil {
+        t.Errorf(err.Error())
+    }
+
+    if len(tweet.Urls) != 1 {
+        t.Errorf("Expected %d urls, but got %d", 1, len(tweet.Urls))
+    }
+
+    expected_url_text := "https://reason.com/2021/08/30/la-teachers-union-cecily-myart-cruz-learning-loss/"
+    if tweet.Urls[0].Text != expected_url_text {
+        t.Errorf("Expected Url text to be %q, but got %q", expected_url_text, tweet.Urls[0].Text)
+    }
+    if !tweet.Urls[0].HasCard {
+        t.Errorf("Expected it to have a card, but it doesn't")
+    }
+    expected_url_domain := "reason.com"
+    if tweet.Urls[0].Domain != expected_url_domain {
+        t.Errorf("Expected Url domain to be %q, but got %q", expected_url_domain, tweet.Urls[0].Domain)
+    }
+}
+
+func TestParseTweetWithUrlButNoCard(t *testing.T) {
+    data, err := ioutil.ReadFile("test_responses/tweet_with_url_but_no_card.json")
+    if err != nil {
+        panic(err)
+    }
+    var apitweet scraper.APITweet
+    err = json.Unmarshal(data, &apitweet)
+    if err != nil {
+        t.Errorf(err.Error())
+    }
+    tweet, err := scraper.ParseSingleTweet(apitweet)
+    if err != nil {
+        t.Errorf(err.Error())
+    }
+
+    if len(tweet.Urls) != 1 {
+        t.Errorf("Expected %d urls, but got %d", 1, len(tweet.Urls))
+    }
+
+    expected_url_text := "https://www.politico.com/newsletters/west-wing-playbook/2021/09/16/the-jennifer-rubin-wh-symbiosis-494364"
+    if tweet.Urls[0].Text != expected_url_text {
+        t.Errorf("Expected Url text to be %q, but got %q", expected_url_text, tweet.Urls[0].Text)
+    }
+    if tweet.Urls[0].HasCard {
+        t.Errorf("Expected url not to have a card, but it thinks it has one")
+    }
+}
+
 func TestParseTweetResponse(t *testing.T) {
     data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")
     if err != nil {
@@ -7,6 +7,8 @@ import (
 )
 
 type Url struct {
+    TweetID TweetID
+
     Domain string
     Text   string
     Title  string