Add background scraping of the logged-in user's home timeline

Alessio 2023-08-27 17:46:49 -03:00
parent 09356ecc04
commit 8349ca4ae3
8 changed files with 91 additions and 18 deletions

View File

@@ -20,8 +20,8 @@ pipeline:
     secrets:
       - offline_twatter_passwd
     when: # At least one
-      path: pkg/scraper/**
-      branch: release-*
+      - path: pkg/scraper/**
+      - branch: release-*
     commands:
       - cd cmd
       - ./tests.sh
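
Note: assuming this is a Woodpecker CI config (the `pipeline:` block and secrets syntax suggest it), sibling keys under a single `when` map must all match, while list entries are evaluated independently. Moving `path` and `branch` into list items therefore makes the step run when at least one of the two conditions matches, which is what the `# At least one` comment intends.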

View File

@@ -97,7 +97,7 @@ TODO const-select-fields
 TODO compound-query-structs
 - Right now the result structs for the compound queries (e.g., "Feed", "TweetDetail") are in the `persistence` package. They probably don't belong there.
 
-TOOD: login-routes-tests
+TODO: login-routes-tests
 - Make the scraper.API object injectable somehow (get rid of singleton pattern) and add tests for login and change-session sequences
 - Also test profile.ListSessions()
@@ -111,13 +111,15 @@ TODO: webserver-session-arg-active-user
 TODO: webserver-tombstones
+TODO: fetch-timeline
+- HTMX polling element on base template
 TODO: progressive-web-app
 TODO: paste-twitter-urls-in-search-bar
 - pasting a link from twitter.com into the search bar should handle it properly
   - tweet detail
   - user profile
   - space
 TODO: image-width-and-height
 - Images should have explicit "width" and "height" attributes. This reduces Cumulative Layout Shift (CLS) while loading the page.
 - https://web.dev/optimize-cls/#images-without-dimensions
 TODO: quote-tweet-icon
 - show quote-tweets on a tweet

View File

@@ -53,7 +53,7 @@ func (app *Application) TweetDetail(w http.ResponseWriter, r *http.Request) {
 	// Return whether the scrape succeeded (if false, we should 404)
 	try_scrape_tweet := func() bool {
-		if app.DisableScraping {
+		if app.IsScrapingDisabled {
 			return false
 		}
 		trove, err := scraper.GetTweetFullAPIV2(tweet_id, 50) // TODO: parameterizable

View File

@@ -29,9 +29,9 @@ type Application struct {
 	Middlewares []Middleware
 
-	Profile         persistence.Profile
-	ActiveUser      scraper.User
-	DisableScraping bool
+	Profile            persistence.Profile
+	ActiveUser         scraper.User
+	IsScrapingDisabled bool
 }
 
 func NewApp(profile persistence.Profile) Application {
@@ -41,8 +41,9 @@ func NewApp(profile persistence.Profile) Application {
 		InfoLog:  log.New(os.Stdout, "INFO\t", log.Ldate|log.Ltime),
 		ErrorLog: log.New(os.Stderr, "ERROR\t", log.Ldate|log.Ltime|log.Lshortfile),
-		Profile:    profile,
-		ActiveUser: get_default_user(),
+		Profile:            profile,
+		ActiveUser:         get_default_user(),
+		IsScrapingDisabled: true, // Until an active user is set
 	}
 	ret.Middlewares = []Middleware{
 		secureHeaders,
@@ -64,7 +65,7 @@ func (app *Application) SetActiveUser(handle scraper.UserHandle) error {
 	if handle == "no account" {
 		scraper.InitApi(scraper.NewGuestSession())
 		app.ActiveUser = get_default_user()
-		app.DisableScraping = true // API requests will fail b/c not logged in
+		app.IsScrapingDisabled = true // API requests will fail b/c not logged in
 	} else {
 		user, err := app.Profile.GetUserByHandle(handle)
 		if err != nil {
@@ -72,7 +73,7 @@ func (app *Application) SetActiveUser(handle scraper.UserHandle) error {
 		}
 		scraper.InitApi(app.Profile.LoadSession(handle))
 		app.ActiveUser = user
-		app.DisableScraping = false
+		app.IsScrapingDisabled = false
 	}
 	return nil
 }
@@ -139,6 +140,9 @@ func (app *Application) Run(address string) {
 	}
 	app.InfoLog.Printf("Starting server on %s", address)
+
+	app.start_background()
+
 	err := srv.ListenAndServe()
 	app.ErrorLog.Fatal(err)
 }
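
Taken together with the struct changes above: NewApp starts with IsScrapingDisabled = true, SetActiveUser flips it to false once a stored session is loaded (or back to true for the "no account" guest session), and Run calls start_background() just before ListenAndServe. The scrape loop therefore lives for the duration of the server process and simply no-ops on each tick until a user logs in.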

View File

@@ -49,7 +49,7 @@ func selector(s string) cascadia.Sel {
 func do_request(req *http.Request) *http.Response {
 	recorder := httptest.NewRecorder()
 	app := webserver.NewApp(profile)
-	app.DisableScraping = true
+	app.IsScrapingDisabled = true
 	app.ServeHTTP(recorder, req)
 	return recorder.Result()
 }

View File

@@ -0,0 +1,65 @@
+package webserver
+
+import (
+	"fmt"
+	"gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
+	"time"
+)
+
+var is_for_you_only = true // Do one initial scrape of the "for you" feed and then just the regular feed after that
+
+func (app *Application) background_scrape() {
+	// Avoid crashing the thread if a scrape fails
+	defer func() {
+		if err := recover(); err != nil {
+			// TODO
+			fmt.Println("Panicked!")
+			fmt.Printf("%#v\n", err)
+		}
+	}()
+
+	fmt.Println("Starting scrape...")
+
+	// Do nothing if scraping is currently disabled
+	if app.IsScrapingDisabled {
+		fmt.Println("Skipping scrape!")
+		return
+	}
+
+	fmt.Println("Scraping...")
+	trove, err := scraper.GetHomeTimeline("", is_for_you_only)
+	if err != nil {
+		app.ErrorLog.Printf("Background scrape failed: %s", err.Error())
+		return
+	}
+	fmt.Println("Saving scrape results...")
+	app.Profile.SaveTweetTrove(trove)
+	fmt.Println("Scraping succeeded.")
+	is_for_you_only = false
+}
+
+func (app *Application) start_background() {
+	// Start a goroutine to run the background task every 3 minutes
+	fmt.Println("Starting background")
+	go func() {
+		fmt.Println("Starting routine")
+
+		// Initial delay before the first task execution (10 seconds; adjust as needed)
+		initialDelay := 10 * time.Second
+		time.Sleep(initialDelay)
+		app.background_scrape()
+
+		// Create a ticker that triggers the background task every 3 minutes
+		interval := 3 * time.Minute // TODO: parameterizable
+		timer := time.NewTicker(interval)
+		defer timer.Stop()
+
+		for range timer.C {
+			// Execute the background task
+			fmt.Println("Starting routine")
+			app.background_scrape()
+		}
+	}()
+}
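
As committed, the ticker loop runs until the process exits. A context-aware variant could stop the loop cleanly on server shutdown; the following is a hypothetical sketch, not part of this commit (the function name and the context plumbing are assumptions, and it would additionally import "context"):

// Hypothetical sketch (not in this commit): the same delay-then-ticker
// pattern, made cancellable so the server can stop it on shutdown.
func (app *Application) start_background_cancellable(ctx context.Context) {
	go func() {
		// Initial delay, interruptible by cancellation
		select {
		case <-ctx.Done():
			return
		case <-time.After(10 * time.Second):
		}
		app.background_scrape()

		ticker := time.NewTicker(3 * time.Minute)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return // server is shutting down
			case <-ticker.C:
				app.background_scrape()
			}
		}
	}()
}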

View File

@@ -234,6 +234,8 @@ func (api *API) do_http_POST(url string, body string, result interface{}) error
 	if err != nil {
 		return fmt.Errorf("Error executing HTTP POST request:\n %w", err)
 	}
+
+	api.update_csrf_token()
 	defer resp.Body.Close()
 
 	if resp.StatusCode != 200 {
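
update_csrf_token itself is not shown in this diff. Twitter's web API requires the x-csrf-token header to match the ct0 cookie, and the server rotates that cookie in responses, so refreshing the stored token after every POST keeps a long-lived session usable. A minimal sketch of what such a helper might look like, assuming (not confirmed by this diff) that API holds an http.Client with a cookie jar and a CSRFToken field, and importing "net/url":

// Sketch only: the Client and CSRFToken field names are assumptions.
func (api *API) update_csrf_token() {
	twitter_url, err := url.Parse("https://twitter.com")
	if err != nil {
		panic(err)
	}
	// Mirror the rotated "ct0" cookie into the stored CSRF token
	for _, cookie := range api.Client.Jar.Cookies(twitter_url) {
		if cookie.Name == "ct0" {
			api.CSRFToken = cookie.Value
		}
	}
}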

View File

@@ -954,7 +954,7 @@ func GetUserLikes(user_id UserID, cursor string) (TweetTrove, error) {
 	return the_api.GetUserLikes(user_id, cursor)
 }
 
-func (api API) GetHomeTimeline(cursor string, is_for_you bool) (TweetTrove, error) {
+func (api *API) GetHomeTimeline(cursor string, is_for_you bool) (TweetTrove, error) {
 	var url string
 	body_struct := struct {
 		Variables GraphqlVariables `json:"variables"`
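
The receiver change from API to *API presumably pairs with the do_http_POST change above: with a value receiver, GetHomeTimeline operates on a copy of the API struct, so a CSRF token refreshed mid-request would be written to the copy and discarded when the method returns. A self-contained illustration of the difference:

package main

import "fmt"

type session struct{ token string }

func (s session) refreshByValue()    { s.token = "new" } // mutates a copy only
func (s *session) refreshByPointer() { s.token = "new" } // mutates the caller's struct

func main() {
	s := session{token: "old"}
	s.refreshByValue()
	fmt.Println(s.token) // "old": the update was lost
	s.refreshByPointer()
	fmt.Println(s.token) // "new": the update persisted
}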