Add background scraping of the logged-in user's home timeline

Alessio 2023-08-27 17:46:49 -03:00
parent 09356ecc04
commit 8349ca4ae3
8 changed files with 91 additions and 18 deletions

View File

@@ -20,8 +20,8 @@ pipeline:
     secrets:
       - offline_twatter_passwd
     when: # At least one
-      path: pkg/scraper/**
-      branch: release-*
+      - path: pkg/scraper/**
+      - branch: release-*
     commands:
       - cd cmd
       - ./tests.sh
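
Note: assuming this is a Woodpecker CI config (the `pipeline:` block and secrets syntax suggest it), sibling keys under a single `when` map must all match, while list entries are evaluated independently. Moving `path` and `branch` into list items therefore makes the step run when at least one of the two conditions matches, which is what the `# At least one` comment intends.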

View File

@@ -97,7 +97,7 @@ TODO const-select-fields
 TODO compound-query-structs
 - Right now the result structs for the compound queries (e.g., "Feed", "TweetDetail") are in the `persistence` package. They probably don't belong there.
 
-TOOD: login-routes-tests
+TODO: login-routes-tests
 - Make the scraper.API object injectable somehow (get rid of singleton pattern) and add tests for login and change-session sequences
 - Also test profile.ListSessions()
@@ -111,13 +111,15 @@ TODO: webserver-session-arg-active-user
 TODO: webserver-tombstones
+TODO: fetch-timeline
+- HTMX polling element on base template
 TODO: progressive-web-app
 TODO: paste-twitter-urls-in-search-bar
 - pasting a link from twitter.com into the search bar should handle it properly
   - tweet detail
   - user profile
   - space
 TODO: image-width-and-height
 - Images should have explicit "width" and "height" attributes. This reduces Cumulative Layout Shift (CLS) while loading the page.
 - https://web.dev/optimize-cls/#images-without-dimensions
 TODO: quote-tweet-icon
 - show quote-tweets on a tweet

View File

@@ -53,7 +53,7 @@ func (app *Application) TweetDetail(w http.ResponseWriter, r *http.Request) {
 	// Return whether the scrape succeeded (if false, we should 404)
 	try_scrape_tweet := func() bool {
-		if app.DisableScraping {
+		if app.IsScrapingDisabled {
 			return false
 		}
 		trove, err := scraper.GetTweetFullAPIV2(tweet_id, 50) // TODO: parameterizable

View File

@@ -29,9 +29,9 @@ type Application struct {
 	Middlewares []Middleware
 
-	Profile         persistence.Profile
-	ActiveUser      scraper.User
-	DisableScraping bool
+	Profile            persistence.Profile
+	ActiveUser         scraper.User
+	IsScrapingDisabled bool
 }
 
 func NewApp(profile persistence.Profile) Application {
@@ -41,8 +41,9 @@ func NewApp(profile persistence.Profile) Application {
 		InfoLog:  log.New(os.Stdout, "INFO\t", log.Ldate|log.Ltime),
 		ErrorLog: log.New(os.Stderr, "ERROR\t", log.Ldate|log.Ltime|log.Lshortfile),
-		Profile:    profile,
-		ActiveUser: get_default_user(),
+		Profile:            profile,
+		ActiveUser:         get_default_user(),
+		IsScrapingDisabled: true, // Until an active user is set
 	}
 	ret.Middlewares = []Middleware{
 		secureHeaders,
@@ -64,7 +65,7 @@ func (app *Application) SetActiveUser(handle scraper.UserHandle) error {
 	if handle == "no account" {
 		scraper.InitApi(scraper.NewGuestSession())
 		app.ActiveUser = get_default_user()
-		app.DisableScraping = true // API requests will fail b/c not logged in
+		app.IsScrapingDisabled = true // API requests will fail b/c not logged in
 	} else {
 		user, err := app.Profile.GetUserByHandle(handle)
 		if err != nil {
@@ -72,7 +73,7 @@ func (app *Application) SetActiveUser(handle scraper.UserHandle) error {
 		}
 		scraper.InitApi(app.Profile.LoadSession(handle))
 		app.ActiveUser = user
-		app.DisableScraping = false
+		app.IsScrapingDisabled = false
 	}
 	return nil
 }
@@ -139,6 +140,9 @@ func (app *Application) Run(address string) {
 	}
 	app.InfoLog.Printf("Starting server on %s", address)
+
+	app.start_background()
+
 	err := srv.ListenAndServe()
 	app.ErrorLog.Fatal(err)
 }
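
Taken together with the struct changes above: NewApp starts with IsScrapingDisabled = true, SetActiveUser flips it to false once a stored session is loaded (or back to true for the "no account" guest session), and Run calls start_background() just before ListenAndServe. The scrape loop therefore lives for the duration of the server process and simply no-ops on each tick until a user logs in.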

View File

@@ -49,7 +49,7 @@ func selector(s string) cascadia.Sel {
 func do_request(req *http.Request) *http.Response {
 	recorder := httptest.NewRecorder()
 	app := webserver.NewApp(profile)
-	app.DisableScraping = true
+	app.IsScrapingDisabled = true
 	app.ServeHTTP(recorder, req)
 	return recorder.Result()
 }

View File

@@ -0,0 +1,65 @@
+package webserver
+
+import (
+	"fmt"
+	"gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
+	"time"
+)
+
+var is_for_you_only = true // Do one initial scrape of the "for you" feed and then just the regular feed after that
+
+func (app *Application) background_scrape() {
+	// Avoid crashing the thread if a scrape fails
+	defer func() {
+		if err := recover(); err != nil {
+			// TODO
+			fmt.Println("Panicked!")
+			fmt.Printf("%#v\n", err)
+		}
+	}()
+
+	fmt.Println("Starting scrape...")
+
+	// Do nothing if scraping is currently disabled
+	if app.IsScrapingDisabled {
+		fmt.Println("Skipping scrape!")
+		return
+	}
+
+	fmt.Println("Scraping...")
+	trove, err := scraper.GetHomeTimeline("", is_for_you_only)
+	if err != nil {
+		app.ErrorLog.Printf("Background scrape failed: %s", err.Error())
+		return
+	}
+	fmt.Println("Saving scrape results...")
+	app.Profile.SaveTweetTrove(trove)
+	fmt.Println("Scraping succeeded.")
+	is_for_you_only = false
+}
+
+func (app *Application) start_background() {
+	// Start a goroutine to run the background task every 3 minutes
+	fmt.Println("Starting background")
+	go func() {
+		fmt.Println("Starting routine")
+
+		// Initial delay before the first task execution (10 seconds; adjust as needed)
+		initialDelay := 10 * time.Second
+		time.Sleep(initialDelay)
+		app.background_scrape()
+
+		// Create a ticker that triggers the background task every 3 minutes
+		interval := 3 * time.Minute // TODO: parameterizable
+		timer := time.NewTicker(interval)
+		defer timer.Stop()
+
+		for range timer.C {
+			// Execute the background task
+			fmt.Println("Starting routine")
+			app.background_scrape()
+		}
+	}()
+}
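
As committed, the ticker loop runs until the process exits. A context-aware variant could stop the loop cleanly on server shutdown; the following is a hypothetical sketch, not part of this commit (the function name and the context plumbing are assumptions, and it would additionally import "context"):

// Hypothetical sketch (not in this commit): the same delay-then-ticker
// pattern, made cancellable so the server can stop it on shutdown.
func (app *Application) start_background_cancellable(ctx context.Context) {
	go func() {
		// Initial delay, interruptible by cancellation
		select {
		case <-ctx.Done():
			return
		case <-time.After(10 * time.Second):
		}
		app.background_scrape()

		ticker := time.NewTicker(3 * time.Minute)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return // server is shutting down
			case <-ticker.C:
				app.background_scrape()
			}
		}
	}()
}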

View File

@@ -234,6 +234,8 @@ func (api *API) do_http_POST(url string, body string, result interface{}) error
 	if err != nil {
 		return fmt.Errorf("Error executing HTTP POST request:\n %w", err)
 	}
+
+	api.update_csrf_token()
 	defer resp.Body.Close()
 
 	if resp.StatusCode != 200 {
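
update_csrf_token itself is not shown in this diff. Twitter's web API requires the x-csrf-token header to match the ct0 cookie, and the server rotates that cookie in responses, so refreshing the stored token after every POST keeps a long-lived session usable. A minimal sketch of what such a helper might look like, assuming (not confirmed by this diff) that API holds an http.Client with a cookie jar and a CSRFToken field, and importing "net/url":

// Sketch only: the Client and CSRFToken field names are assumptions.
func (api *API) update_csrf_token() {
	twitter_url, err := url.Parse("https://twitter.com")
	if err != nil {
		panic(err)
	}
	// Mirror the rotated "ct0" cookie into the stored CSRF token
	for _, cookie := range api.Client.Jar.Cookies(twitter_url) {
		if cookie.Name == "ct0" {
			api.CSRFToken = cookie.Value
		}
	}
}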

View File

@@ -954,7 +954,7 @@ func GetUserLikes(user_id UserID, cursor string) (TweetTrove, error) {
 	return the_api.GetUserLikes(user_id, cursor)
 }
 
-func (api API) GetHomeTimeline(cursor string, is_for_you bool) (TweetTrove, error) {
+func (api *API) GetHomeTimeline(cursor string, is_for_you bool) (TweetTrove, error) {
 	var url string
 	body_struct := struct {
 		Variables GraphqlVariables `json:"variables"`
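
The receiver change from API to *API presumably pairs with the do_http_POST change above: with a value receiver, GetHomeTimeline operates on a copy of the API struct, so a CSRF token refreshed mid-request would be written to the copy and discarded when the method returns. A self-contained illustration of the difference:

package main

import "fmt"

type session struct{ token string }

func (s session) refreshByValue()    { s.token = "new" } // mutates a copy only
func (s *session) refreshByPointer() { s.token = "new" } // mutates the caller's struct

func main() {
	s := session{token: "old"}
	s.refreshByValue()
	fmt.Println(s.token) // "old": the update was lost
	s.refreshByPointer()
	fmt.Println(s.token) // "new": the update persisted
}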