Recognize deleted users when scraping, and create a user with a fake ID for them

This commit is contained in:
Alessio 2022-02-27 23:05:37 -08:00
parent 728aaf251f
commit 52370a4f9d
6 changed files with 58 additions and 10 deletions

View File

@ -197,10 +197,11 @@ test $(find link_preview_images | wc -l) = $initial_link_preview_images_count #
# Test a tweet thread with tombstones
tw fetch_tweet https://twitter.com/CovfefeAnon/status/1454526270809726977
test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454515503242829830") = 1
test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454521424144654344") = 0
test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454522147750260742") = 1
# tw fetch_tweet https://twitter.com/CovfefeAnon/status/1454526270809726977
# test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454515503242829830") = 1
# test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454521424144654344") = 0 # TODO this guy got banned
# test $(sqlite3 twitter.db "select is_stub from tweets where id = 1454522147750260742") = 1
# Test search
tw search "from:michaelmalice constitution"

View File

@ -244,6 +244,7 @@ type APIUser struct {
StatusesCount int `json:"statuses_count"`
Verified bool `json:"verified"`
IsBanned bool
DoesntExist bool
}
@ -256,6 +257,7 @@ type UserResponse struct {
} `json:"data"`
Errors []struct {
Message string `json:"message"`
Name string `json:"name"`
Code int `json:"code"`
} `json:"errors"`
}
@ -267,6 +269,8 @@ func (u UserResponse) ConvertToAPIUser() APIUser {
for _, api_error := range u.Errors {
if api_error.Message == "Authorization: User has been suspended. (63)" {
ret.IsBanned = true
} else if api_error.Name == "NotFoundError" {
ret.DoesntExist = true
} else {
panic(fmt.Sprintf("Unknown api error: %q", api_error.Message))
}

View File

@ -0,0 +1 @@
{"errors":[{"message":"User 'GregCunningham0' not found","locations":[{"line":126,"column":3}],"path":["user"],"extensions":{"name":"NotFoundError","source":"Server","code":50,"kind":"NonFatal","tracing":{"trace_id":"2c6f690015d9f18e"}},"code":50,"kind":"NonFatal","name":"NotFoundError","source":"Server","tracing":{"trace_id":"2c6f690015d9f18e"}}],"data":{}}

View File

@ -87,7 +87,14 @@ func (trove *TweetTrove) FetchTombstoneUsers() {
}
if user.ID == 0 {
panic(fmt.Sprintf("UserID == 0 (@%s)", handle))
// Find some random ID to fit it into the trove
for i := 1; ; i++ {
_, ok := trove.Users[UserID(i)]
if !ok {
user.ID = UserID(i)
break
}
}
}
trove.Users[user.ID] = user
@ -108,7 +115,7 @@ func (trove *TweetTrove) FetchTombstoneUsers() {
func (trove *TweetTrove) FillMissingUserIDs() {
for i := range trove.Tweets {
tweet := trove.Tweets[i]
if tweet.UserID != 0 {
if tweet.UserHandle == "" {
// No need to fill this tweet's user_id, it's already filled
continue
}

View File

@ -115,10 +115,6 @@ func GetUnknownUserWithHandle(handle UserHandle) User {
JoinDate: time.Unix(0, 0),
IsVerified: false,
IsPrivate: true,
ProfileImageUrl: DEFAULT_PROFILE_IMAGE_URL,
ProfileImageLocalPath: path.Base(DEFAULT_PROFILE_IMAGE_URL),
BannerImageUrl: "",
BannerImageLocalPath: "",
IsNeedingFakeID: true,
IsIdFake: true,
}
@ -126,6 +122,14 @@ func GetUnknownUserWithHandle(handle UserHandle) User {
// Turn an APIUser, as returned from the scraper, into a properly structured User object
func ParseSingleUser(apiUser APIUser) (ret User, err error) {
if apiUser.DoesntExist {
// User may have been deleted, or there was a typo. There's no data to parse
if apiUser.ScreenName == "" {
panic("ScreenName is empty!")
}
ret = GetUnknownUserWithHandle(UserHandle(apiUser.ScreenName))
return
}
ret.ID = UserID(apiUser.ID)
ret.Handle = UserHandle(apiUser.ScreenName)
if apiUser.IsBanned {

View File

@ -73,6 +73,37 @@ func TestParseBannedUser(t *testing.T) {
assert.Equal("default_profile.png", user.GetTinyProfileImageLocalPath())
}
/**
 * Should correctly parse a deleted user.
 *
 * Reads the "deleted user" API response fixture, converts it to an APIUser, fills in the
 * handle by hand, and checks that ParseSingleUser returns a placeholder user: ID 0, flagged
 * as having (and needing) a fake ID, with the default profile image.
 */
func TestParseDeletedUser(t *testing.T) {
	assert := assert.New(t)

	// A missing or unreadable fixture should fail this test through the test framework
	// rather than panic the whole test binary.
	data, err := ioutil.ReadFile("test_responses/deleted_user.json")
	require.NoError(t, err)

	var user_resp UserResponse
	err = json.Unmarshal(data, &user_resp)
	require.NoError(t, err)

	handle := "Some Random Deleted User"

	apiUser := user_resp.ConvertToAPIUser()
	apiUser.ScreenName = handle // This is done in scraper.GetUser, since users are retrieved by handle anyway
	user, err := ParseSingleUser(apiUser)
	require.NoError(t, err)

	// Expected value goes first so that failure messages label expected/actual correctly.
	assert.Equal(UserID(0), user.ID)
	assert.True(user.IsIdFake)
	assert.True(user.IsNeedingFakeID)
	assert.Equal("<blank>", user.Bio)
	assert.Equal(UserHandle(handle), user.Handle)

	// Test generation of profile images for deleted user
	assert.Equal("https://abs.twimg.com/sticky/default_profile_images/default_profile.png", user.GetTinyProfileImageUrl())
	assert.Equal("default_profile.png", user.GetTinyProfileImageLocalPath())
}
/**
* Should extract a user handle from a tweet URL, or fail if URL is invalid
*/