Make fetching very aggressive by default

This commit is contained in:
Alessio 2021-10-10 15:13:32 -07:00
parent 66daacec34
commit c219652dcc
3 changed files with 29 additions and 7 deletions

View File

@ -137,8 +137,9 @@ test $(sqlite3 twitter.db "select is_private from users where handle = 'HbdNrx'"
# Test tweets with URLs # Test tweets with URLs
tw fetch_user CovfefeAnon
urls_count=$(sqlite3 twitter.db "select count(*) from urls") urls_count=$(sqlite3 twitter.db "select count(*) from urls")
tw fetch_tweet https://twitter.com/CovfefeAnon/status/1428904664645394433 tw fetch_tweet_only https://twitter.com/CovfefeAnon/status/1428904664645394433
urls_count_after=$(sqlite3 twitter.db "select count(*) from urls") urls_count_after=$(sqlite3 twitter.db "select count(*) from urls")
test $urls_count_after = $(($urls_count + 1)) test $urls_count_after = $(($urls_count + 1))
test "$(sqlite3 twitter.db "select title from urls where tweet_id = 1428904664645394433")" = "Justice Department investigating Elon Musk's SpaceX following complaint of hiring discrimination" test "$(sqlite3 twitter.db "select title from urls where tweet_id = 1428904664645394433")" = "Justice Department investigating Elon Musk's SpaceX following complaint of hiring discrimination"
@ -147,7 +148,7 @@ thumbnail_name=$(sqlite3 twitter.db "select thumbnail_remote_url from urls where
test -n "$thumbnail_name" # Not testing for what the thumbnail url is because it keeps changing test -n "$thumbnail_name" # Not testing for what the thumbnail url is because it keeps changing
# Try to double-fetch it; shouldn't duplicate the URL # Try to double-fetch it; shouldn't duplicate the URL
tw fetch_tweet https://twitter.com/CovfefeAnon/status/1428904664645394433 tw fetch_tweet_only https://twitter.com/CovfefeAnon/status/1428904664645394433
urls_count_after_2x=$(sqlite3 twitter.db "select count(*) from urls") urls_count_after_2x=$(sqlite3 twitter.db "select count(*) from urls")
test $urls_count_after_2x = $urls_count_after test $urls_count_after_2x = $urls_count_after
@ -163,7 +164,8 @@ test -f link_preview_images/${thumbnail_name}_800x320_1.jpg
# Test a tweet with a URL but no thumbnail # Test a tweet with a URL but no thumbnail
tw fetch_tweet https://twitter.com/Xirong7/status/1413665734866186243 tw fetch_user Xirong7
tw fetch_tweet_only https://twitter.com/Xirong7/status/1413665734866186243
test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1413665734866186243") = "0" test $(sqlite3 twitter.db "select is_content_downloaded from urls where tweet_id = 1413665734866186243") = "0"
test $(sqlite3 twitter.db "select has_thumbnail from urls where tweet_id = 1413665734866186243") = "0" test $(sqlite3 twitter.db "select has_thumbnail from urls where tweet_id = 1413665734866186243") = "0"
initial_link_preview_images_count=$(find link_preview_images | wc -l) # Check that it doesn't change, since there's no thumbnail initial_link_preview_images_count=$(find link_preview_images | wc -l) # Check that it doesn't change, since there's no thumbnail

View File

@ -155,19 +155,26 @@ func fetch_tweet_conversation(tweet_identifier string) {
} }
for _, u := range users { for _, u := range users {
fmt.Println(u) // fmt.Println(u)
err = profile.SaveUser(u) err = profile.SaveUser(u)
if err != nil { if err != nil {
die("Error saving tweet: " + err.Error(), false, 4) die("Error saving user: " + err.Error(), false, 4)
}
err = profile.DownloadUserContentFor(&u)
if err != nil {
die("Error getting user content: " + err.Error(), false, 10)
} }
} }
for _, t := range tweets { for _, t := range tweets {
fmt.Println(t)
err = profile.SaveTweet(t) err = profile.SaveTweet(t)
if err != nil { if err != nil {
die("Error saving tweet: " + err.Error(), false, 4) die("Error saving tweet: " + err.Error(), false, 4)
} }
err = profile.DownloadTweetContentFor(&t)
if err != nil {
die("Error getting tweet content: " + err.Error(), false, 11)
}
} }
fmt.Printf("Saved %d tweets and %d users. Exiting successfully\n", len(tweets), len(users)) fmt.Printf("Saved %d tweets and %d users. Exiting successfully\n", len(tweets), len(users))
} }
@ -193,7 +200,11 @@ func fetch_user_feed(handle string, how_many int) {
for _, u := range users { for _, u := range users {
err = profile.SaveUser(u) err = profile.SaveUser(u)
if err != nil { if err != nil {
die("Error saving tweet: " + err.Error(), false, 4) die("Error saving user: " + err.Error(), false, 4)
}
err = profile.DownloadUserContentFor(&u)
if err != nil {
die("Error getting user content: " + err.Error(), false, 10)
} }
} }
@ -202,6 +213,10 @@ func fetch_user_feed(handle string, how_many int) {
if err != nil { if err != nil {
die("Error saving tweet: " + err.Error(), false, 4) die("Error saving tweet: " + err.Error(), false, 4)
} }
err = profile.DownloadTweetContentFor(&t)
if err != nil {
die("Error getting tweet content: " + err.Error(), false, 11)
}
} }
for _, r := range retweets { for _, r := range retweets {

View File

@ -6,6 +6,7 @@ import (
"path" "path"
"net/http" "net/http"
"io/ioutil" "io/ioutil"
"strings"
"offline_twitter/scraper" "offline_twitter/scraper"
) )
@ -150,6 +151,10 @@ func (p Profile) DownloadUserContentWithInjector(u *scraper.User, downloader Med
outfile = path.Join(p.ProfileDir, "profile_images", u.BannerImageLocalPath) outfile = path.Join(p.ProfileDir, "profile_images", u.BannerImageLocalPath)
err = downloader.Curl(u.BannerImageUrl, outfile) err = downloader.Curl(u.BannerImageUrl, outfile)
if err != nil { if err != nil {
if strings.Contains(err.Error(), "404 Not Found") {
// Try adding "600x200". Not sure why this does this but sometimes it does.
err = downloader.Curl(u.BannerImageUrl + "/600x200", outfile)
}
return err return err
} }
} }