Expand debug logging to make debugging scraping errors easier

- Print HTTP headers to the debug log on every request
- If parsing an APIV2 tweet panics, print the full JSON of that tweet entry to the debug log
This commit is contained in:
Alessio 2023-06-07 14:46:46 -03:00
parent cba6631a72
commit b37977145a
3 changed files with 45 additions and 9 deletions

View File

@ -268,6 +268,9 @@ func (api *API) do_http(url string, cursor string, result interface{}) error {
return fmt.Errorf("Error executing HTTP request:\n %w", err)
}
defer resp.Body.Close()
for header := range req.Header {
log.Debug(fmt.Sprintf(" %s: %s\n", header, req.Header.Get(header)))
}
if resp.StatusCode != 200 && resp.StatusCode != 403 {
content, err := io.ReadAll(resp.Body)

View File

@ -1,6 +1,7 @@
package scraper
import (
"encoding/json"
"fmt"
"net/url"
"strings"
@ -313,14 +314,22 @@ func (api_v2_tweet APIV2Tweet) ToTweetTrove() TweetTrove {
return ret
}
// ItemContent is the tweet payload of a timeline entry in the GraphQL
// ("APIV2") response format.
type ItemContent struct {
	// NOTE(review): per the TODO below, the "entryType" tag here looks like a
	// typo for "itemType" — confirm it is always empty before removing.
	EntryType    string      `json:"entryType"` // TODO: Assert this is always empty; looks like a typo
	ItemType     string      `json:"itemType"`
	TweetResults APIV2Result `json:"tweet_results"`
}
// Wraps InnerAPIV2Entry to implement `json.Unmarshal`. Does the normal unmarshal but also saves the original JSON.
type APIV2Entry struct {
	InnerAPIV2Entry
	// OriginalJSON holds the raw bytes this entry was decoded from, so that a
	// panic while processing the entry can log the exact offending input
	// (see UnmarshalJSON / ToTweetTrove).
	OriginalJSON string
}
type InnerAPIV2Entry struct {
EntryID string `json:"entryId"`
SortIndex int64 `json:"sortIndex,string"`
Content struct {
ItemContent struct {
EntryType string `json:"entryType"`
TweetResults APIV2Result `json:"tweet_results"`
} `json:"itemContent"`
ItemContent ItemContent `json:"itemContent"`
// Cursors
EntryType string `json:"entryType"`
@ -329,6 +338,25 @@ type APIV2Entry struct {
} `json:"content"`
}
// UnmarshalJSON implements json.Unmarshaler for APIV2Entry.  It performs the
// default decode into the embedded InnerAPIV2Entry, then stashes the raw
// input bytes in OriginalJSON so later failures can report the exact entry.
func (e *APIV2Entry) UnmarshalJSON(data []byte) error {
	if err := json.Unmarshal(data, &e.InnerAPIV2Entry); err != nil {
		return fmt.Errorf("Error parsing json APIV2Entry:\n %w", err)
	}
	e.OriginalJSON = string(data)
	return nil
}
// ToTweetTrove converts this entry's tweet results into a TweetTrove,
// passing `ignore_null_entries` through to the inner conversion.
//
// If the conversion panics, the entry's full original JSON is logged at Warn
// level before the panic is re-raised, so the offending input can be
// inspected (this is the point of saving OriginalJSON in UnmarshalJSON).
func (e APIV2Entry) ToTweetTrove(ignore_null_entries bool) TweetTrove {
	defer func() {
		// `recover` only works directly inside a deferred function; re-panic
		// after logging so callers still observe the original failure.
		if obj := recover(); obj != nil {
			log.Warn(fmt.Sprintf("Panic while decoding entry: %s\n", e.OriginalJSON))
			panic(obj)
		}
	}()
	return e.Content.ItemContent.TweetResults.ToTweetTrove(ignore_null_entries)
}
type APIV2Instruction struct {
Type string `json:"type"`
Entries []APIV2Entry `json:"entries"`
@ -390,13 +418,11 @@ func (api_response APIV2Response) ToTweetTrove() (TweetTrove, error) {
continue
}
result := entry.Content.ItemContent.TweetResults
main_trove := result.ToTweetTrove(true)
main_trove := entry.ToTweetTrove(true)
ret.MergeWith(main_trove)
}
return ret, nil
return ret, nil // TODO: This doesn't need to return an error, it's always nil
}
func get_graphql_user_timeline_url(user_id UserID, cursor string) string {

View File

@ -618,8 +618,15 @@ func TestAPIV2GetMainInstructionFromFeed(t *testing.T) {
assert.Equal(len(feed.GetMainInstruction().Entries), 41)
// Check that they have OriginalJSON filled out
for _, entry := range feed.GetMainInstruction().Entries {
assert.True(len(entry.OriginalJSON) > 0)
}
// Test that this is a writable version
feed.GetMainInstruction().Entries = append(feed.GetMainInstruction().Entries, APIV2Entry{EntryID: "asdf"})
feed.GetMainInstruction().Entries = append(feed.GetMainInstruction().Entries, APIV2Entry{
InnerAPIV2Entry: InnerAPIV2Entry{EntryID: "asdf"},
})
assert.Equal(len(feed.GetMainInstruction().Entries), 42)
assert.Equal(feed.GetMainInstruction().Entries[41].EntryID, "asdf")
}