Add background scraping of the logged-in user's home timeline

This commit is contained in:
Alessio 2023-08-27 17:46:49 -03:00
parent 09356ecc04
commit 8349ca4ae3
8 changed files with 91 additions and 18 deletions

View File

@ -20,8 +20,8 @@ pipeline:
secrets: secrets:
- offline_twatter_passwd - offline_twatter_passwd
when: # At least one when: # At least one
path: pkg/scraper/** - path: pkg/scraper/**
branch: release-* - branch: release-*
commands: commands:
- cd cmd - cd cmd
- ./tests.sh - ./tests.sh

View File

@ -97,7 +97,7 @@ TODO const-select-fields
TODO compound-query-structs TODO compound-query-structs
- Right now the result structs for the compound queries (e.g., "Feed", "TweetDetail") are in the `persistence` package. They don't go there probably - Right now the result structs for the compound queries (e.g., "Feed", "TweetDetail") are in the `persistence` package. They don't go there probably
TOOD: login-routes-tests TODO: login-routes-tests
- Make the scraper.API object injectable somehow (get rid of singleton pattern) and add tests for login and change-session sequences - Make the scraper.API object injectable somehow (get rid of singleton pattern) and add tests for login and change-session sequences
- Also test profile.ListSessions() - Also test profile.ListSessions()
@ -111,13 +111,15 @@ TODO: webserver-session-arg-active-user
TODO: webserver-tombstones TODO: webserver-tombstones
TODO: fetch-timeline
- HTMX polling element on base template
TODO: progressive-web-app TODO: progressive-web-app
TODO: paste-twitter-urls-in-search-bar TODO: paste-twitter-urls-in-search-bar
- pasting a link from twitter.com into the search bar should handle it properly - pasting a link from twitter.com into the search bar should handle it properly
- tweet detail
- user profile
- space - space
TODO: image-width-and-height
- Images should have explicit "width" and "height" attributes. This reduces Cumulative Layout Shift (CLS) while loading the page.
- https://web.dev/optimize-cls/#images-without-dimensions
TODO: quote-tweet-icon
- show quote-tweets on a tweet

View File

@ -53,7 +53,7 @@ func (app *Application) TweetDetail(w http.ResponseWriter, r *http.Request) {
// Return whether the scrape succeeded (if false, we should 404) // Return whether the scrape succeeded (if false, we should 404)
try_scrape_tweet := func() bool { try_scrape_tweet := func() bool {
if app.DisableScraping { if app.IsScrapingDisabled {
return false return false
} }
trove, err := scraper.GetTweetFullAPIV2(tweet_id, 50) // TODO: parameterizable trove, err := scraper.GetTweetFullAPIV2(tweet_id, 50) // TODO: parameterizable

View File

@ -29,9 +29,9 @@ type Application struct {
Middlewares []Middleware Middlewares []Middleware
Profile persistence.Profile Profile persistence.Profile
ActiveUser scraper.User ActiveUser scraper.User
DisableScraping bool IsScrapingDisabled bool
} }
func NewApp(profile persistence.Profile) Application { func NewApp(profile persistence.Profile) Application {
@ -41,8 +41,9 @@ func NewApp(profile persistence.Profile) Application {
InfoLog: log.New(os.Stdout, "INFO\t", log.Ldate|log.Ltime), InfoLog: log.New(os.Stdout, "INFO\t", log.Ldate|log.Ltime),
ErrorLog: log.New(os.Stderr, "ERROR\t", log.Ldate|log.Ltime|log.Lshortfile), ErrorLog: log.New(os.Stderr, "ERROR\t", log.Ldate|log.Ltime|log.Lshortfile),
Profile: profile, Profile: profile,
ActiveUser: get_default_user(), ActiveUser: get_default_user(),
IsScrapingDisabled: true, // Until an active user is set
} }
ret.Middlewares = []Middleware{ ret.Middlewares = []Middleware{
secureHeaders, secureHeaders,
@ -64,7 +65,7 @@ func (app *Application) SetActiveUser(handle scraper.UserHandle) error {
if handle == "no account" { if handle == "no account" {
scraper.InitApi(scraper.NewGuestSession()) scraper.InitApi(scraper.NewGuestSession())
app.ActiveUser = get_default_user() app.ActiveUser = get_default_user()
app.DisableScraping = true // API requests will fail b/c not logged in app.IsScrapingDisabled = true // API requests will fail b/c not logged in
} else { } else {
user, err := app.Profile.GetUserByHandle(handle) user, err := app.Profile.GetUserByHandle(handle)
if err != nil { if err != nil {
@ -72,7 +73,7 @@ func (app *Application) SetActiveUser(handle scraper.UserHandle) error {
} }
scraper.InitApi(app.Profile.LoadSession(handle)) scraper.InitApi(app.Profile.LoadSession(handle))
app.ActiveUser = user app.ActiveUser = user
app.DisableScraping = false app.IsScrapingDisabled = false
} }
return nil return nil
} }
@ -139,6 +140,9 @@ func (app *Application) Run(address string) {
} }
app.InfoLog.Printf("Starting server on %s", address) app.InfoLog.Printf("Starting server on %s", address)
app.start_background()
err := srv.ListenAndServe() err := srv.ListenAndServe()
app.ErrorLog.Fatal(err) app.ErrorLog.Fatal(err)
} }

View File

@ -49,7 +49,7 @@ func selector(s string) cascadia.Sel {
func do_request(req *http.Request) *http.Response { func do_request(req *http.Request) *http.Response {
recorder := httptest.NewRecorder() recorder := httptest.NewRecorder()
app := webserver.NewApp(profile) app := webserver.NewApp(profile)
app.DisableScraping = true app.IsScrapingDisabled = true
app.ServeHTTP(recorder, req) app.ServeHTTP(recorder, req)
return recorder.Result() return recorder.Result()
} }

View File

@ -0,0 +1,65 @@
package webserver
import (
"fmt"
"gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
"time"
)
// NOTE(review): package-level mutable flag, read and written only from the single
// background-scrape goroutine, so it is unsynchronized — confirm no other caller appears.
var is_for_you_only = true // Do one initial scrape of the "for you" feed and then just regular feed after that
// background_scrape performs one scrape of the logged-in user's home timeline
// and saves the resulting tweet trove to the profile.  The first successful
// call fetches the algorithmic "for you" feed; later calls fetch the regular
// feed (see the package-level `is_for_you_only` flag).  It is a no-op while
// scraping is disabled (e.g., no active user session).
func (app *Application) background_scrape() {
	// Avoid crashing the ticker goroutine if a scrape panics.
	defer func() {
		if err := recover(); err != nil {
			app.ErrorLog.Printf("Background scrape panicked: %#v", err)
		}
	}()

	// Do nothing if scraping is currently disabled
	if app.IsScrapingDisabled {
		app.InfoLog.Print("Skipping background scrape (scraping disabled)")
		return
	}

	app.InfoLog.Print("Starting background scrape...")
	trove, err := scraper.GetHomeTimeline("", is_for_you_only)
	if err != nil {
		app.ErrorLog.Printf("Background scrape failed: %s", err.Error())
		return
	}

	app.InfoLog.Print("Saving scrape results...")
	app.Profile.SaveTweetTrove(trove)
	app.InfoLog.Print("Background scrape succeeded.")

	// Only reached on success: a failed first scrape will retry the
	// "for you" feed on the next tick before switching to the regular feed.
	is_for_you_only = false
}
// start_background launches the periodic home-timeline scraper: a single
// goroutine that runs one scrape shortly after startup and then repeats on a
// fixed interval for the lifetime of the process.
func (app *Application) start_background() {
	fmt.Println("Starting background")
	go func() {
		fmt.Println("Starting routine")

		// Short startup delay before the first scrape.
		time.Sleep(10 * time.Second)
		app.background_scrape()

		// Then scrape on a fixed schedule.
		ticker := time.NewTicker(3 * time.Minute) // TODO: parameterizable
		defer ticker.Stop()
		for range ticker.C {
			fmt.Println("Starting routine")
			app.background_scrape()
		}
	}()
}

View File

@ -234,6 +234,8 @@ func (api *API) do_http_POST(url string, body string, result interface{}) error
if err != nil { if err != nil {
return fmt.Errorf("Error executing HTTP POST request:\n %w", err) return fmt.Errorf("Error executing HTTP POST request:\n %w", err)
} }
api.update_csrf_token()
defer resp.Body.Close() defer resp.Body.Close()
if resp.StatusCode != 200 { if resp.StatusCode != 200 {

View File

@ -954,7 +954,7 @@ func GetUserLikes(user_id UserID, cursor string) (TweetTrove, error) {
return the_api.GetUserLikes(user_id, cursor) return the_api.GetUserLikes(user_id, cursor)
} }
func (api API) GetHomeTimeline(cursor string, is_for_you bool) (TweetTrove, error) { func (api *API) GetHomeTimeline(cursor string, is_for_you bool) (TweetTrove, error) {
var url string var url string
body_struct := struct { body_struct := struct {
Variables GraphqlVariables `json:"variables"` Variables GraphqlVariables `json:"variables"`