From b37977145a76e166cdc1c67fe1d758e5c9693b60 Mon Sep 17 00:00:00 2001 From: Alessio Date: Wed, 7 Jun 2023 14:46:46 -0300 Subject: [PATCH] Expand debug logging to make debugging scraping errors easier - Print HTTP headers to the debug log on every request - If parsing an APIV2 tweet panics, print the full JSON of that tweet entry to the debug log --- scraper/api_request_utils.go | 3 +++ scraper/api_types_v2.go | 42 +++++++++++++++++++++++++++++------- scraper/api_types_v2_test.go | 9 +++++++- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/scraper/api_request_utils.go b/scraper/api_request_utils.go index afae5fa..feb7c53 100644 --- a/scraper/api_request_utils.go +++ b/scraper/api_request_utils.go @@ -268,6 +268,9 @@ func (api *API) do_http(url string, cursor string, result interface{}) error { return fmt.Errorf("Error executing HTTP request:\n %w", err) } defer resp.Body.Close() + for header := range req.Header { + log.Debug(fmt.Sprintf(" %s: %s\n", header, req.Header.Get(header))) + } if resp.StatusCode != 200 && resp.StatusCode != 403 { content, err := io.ReadAll(resp.Body) diff --git a/scraper/api_types_v2.go b/scraper/api_types_v2.go index 835d111..d9618bb 100644 --- a/scraper/api_types_v2.go +++ b/scraper/api_types_v2.go @@ -1,6 +1,7 @@ package scraper import ( + "encoding/json" "fmt" "net/url" "strings" @@ -313,14 +314,22 @@ func (api_v2_tweet APIV2Tweet) ToTweetTrove() TweetTrove { return ret } +type ItemContent struct { + EntryType string `json:"entryType"` // TODO: Assert this is always empty; looks like a typo + ItemType string `json:"itemType"` + TweetResults APIV2Result `json:"tweet_results"` +} + +// Wraps InnerAPIV2Entry to implement `json.Unmarshaler`. Does the normal unmarshal but also saves the original JSON. 
type APIV2Entry struct { + InnerAPIV2Entry + OriginalJSON string +} +type InnerAPIV2Entry struct { EntryID string `json:"entryId"` SortIndex int64 `json:"sortIndex,string"` Content struct { - ItemContent struct { - EntryType string `json:"entryType"` - TweetResults APIV2Result `json:"tweet_results"` - } `json:"itemContent"` + ItemContent ItemContent `json:"itemContent"` // Cursors EntryType string `json:"entryType"` @@ -329,6 +338,25 @@ type APIV2Entry struct { } `json:"content"` } +func (e *APIV2Entry) UnmarshalJSON(data []byte) error { + err := json.Unmarshal(data, &e.InnerAPIV2Entry) + if err != nil { + return fmt.Errorf("Error parsing json APIV2Entry:\n %w", err) + } + e.OriginalJSON = string(data) + return nil +} + +func (e APIV2Entry) ToTweetTrove(ignore_null_entries bool) TweetTrove { + defer func() { + if obj := recover(); obj != nil { + log.Warn(fmt.Sprintf("Panic while decoding entry: %s\n", e.OriginalJSON)) + panic(obj) + } + }() + return e.Content.ItemContent.TweetResults.ToTweetTrove(ignore_null_entries) +} + type APIV2Instruction struct { Type string `json:"type"` Entries []APIV2Entry `json:"entries"` @@ -390,13 +418,11 @@ func (api_response APIV2Response) ToTweetTrove() (TweetTrove, error) { continue } - result := entry.Content.ItemContent.TweetResults - - main_trove := result.ToTweetTrove(true) + main_trove := entry.ToTweetTrove(true) ret.MergeWith(main_trove) } - return ret, nil + return ret, nil // TODO: This doesn't need to return an error, it's always nil } func get_graphql_user_timeline_url(user_id UserID, cursor string) string { diff --git a/scraper/api_types_v2_test.go b/scraper/api_types_v2_test.go index ebeec9d..c00832e 100644 --- a/scraper/api_types_v2_test.go +++ b/scraper/api_types_v2_test.go @@ -618,8 +618,15 @@ func TestAPIV2GetMainInstructionFromFeed(t *testing.T) { assert.Equal(len(feed.GetMainInstruction().Entries), 41) + // Check that they have OriginalJSON filled out + for _, entry := range feed.GetMainInstruction().Entries { + 
assert.True(len(entry.OriginalJSON) > 0) + } + // Test that this is a writable version - feed.GetMainInstruction().Entries = append(feed.GetMainInstruction().Entries, APIV2Entry{EntryID: "asdf"}) + feed.GetMainInstruction().Entries = append(feed.GetMainInstruction().Entries, APIV2Entry{ + InnerAPIV2Entry: InnerAPIV2Entry{EntryID: "asdf"}, + }) assert.Equal(len(feed.GetMainInstruction().Entries), 42) assert.Equal(feed.GetMainInstruction().Entries[41].EntryID, "asdf") }