Add scraping notification detail

This commit is contained in:
Alessio 2024-08-28 19:22:09 -07:00
parent dc816c6f28
commit 0c620621a6
9 changed files with 187 additions and 14 deletions

View File

@ -603,6 +603,12 @@ func get_notifications(how_many int) {
if err != nil && !errors.Is(err, scraper.END_OF_FEED) { if err != nil && !errors.Is(err, scraper.END_OF_FEED) {
panic(err) panic(err)
} }
to_scrape := profile.CheckNotificationScrapesNeeded(trove)
trove, err = api.GetNotificationDetailForAll(trove, to_scrape)
if err != nil {
panic(err)
}
profile.SaveTweetTrove(trove, true, &api) profile.SaveTweetTrove(trove, true, &api)
happy_exit(fmt.Sprintf("Saved %d notifications, %d tweets and %d users", happy_exit(fmt.Sprintf("Saved %d notifications, %d tweets and %d users",
len(trove.Notifications), len(trove.Tweets), len(trove.Users), len(trove.Notifications), len(trove.Tweets), len(trove.Users),

View File

@ -1,6 +1,9 @@
package persistence package persistence
import ( import (
"database/sql"
"errors"
"fmt"
. "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper" . "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
) )
@ -12,16 +15,20 @@ func (p Profile) SaveNotification(n Notification) {
// Save the Notification // Save the Notification
_, err = tx.NamedExec(` _, err = tx.NamedExec(`
insert into notifications(id, type, sent_at, sort_index, user_id, action_user_id, action_tweet_id, action_retweet_id) insert into notifications(id, type, sent_at, sort_index, user_id, action_user_id, action_tweet_id, action_retweet_id,
has_detail, last_scraped_at)
values (:id, :type, :sent_at, :sort_index, :user_id, nullif(:action_user_id, 0), nullif(:action_tweet_id, 0), values (:id, :type, :sent_at, :sort_index, :user_id, nullif(:action_user_id, 0), nullif(:action_tweet_id, 0),
nullif(:action_retweet_id, 0)) nullif(:action_retweet_id, 0), :has_detail, :last_scraped_at)
on conflict do update on conflict do update
set sent_at = max(sent_at, :sent_at), set sent_at = max(sent_at, :sent_at),
sort_index = max(sort_index, :sort_index), sort_index = max(sort_index, :sort_index),
action_user_id = nullif(:action_user_id, 0), action_user_id = nullif(:action_user_id, 0),
action_tweet_id = nullif(:action_tweet_id, 0) action_tweet_id = nullif(:action_tweet_id, 0),
has_detail = has_detail or :has_detail,
last_scraped_at = max(last_scraped_at, :last_scraped_at)
`, n) `, n)
if err != nil { if err != nil {
fmt.Printf("failed to save notification %#v\n", n)
panic(err) panic(err)
} }
@ -62,7 +69,7 @@ func (p Profile) GetNotification(id NotificationID) Notification {
var ret Notification var ret Notification
err := p.DB.Get(&ret, err := p.DB.Get(&ret,
`select id, type, sent_at, sort_index, user_id, ifnull(action_user_id, 0) action_user_id, `select id, type, sent_at, sort_index, user_id, ifnull(action_user_id, 0) action_user_id,
ifnull(action_tweet_id, 0) action_tweet_id, ifnull(action_retweet_id, 0) action_retweet_id ifnull(action_tweet_id, 0) action_tweet_id, ifnull(action_retweet_id, 0) action_retweet_id, has_detail, last_scraped_at
from notifications where id = ?`, from notifications where id = ?`,
id) id)
if err != nil { if err != nil {
@ -82,3 +89,29 @@ func (p Profile) GetNotification(id NotificationID) Notification {
} }
return ret return ret
} }
// CheckNotificationScrapesNeeded returns the IDs of notifications in `trove` whose
// detail pages still need fetching: ones flagged HasDetail that either have never
// been scraped, or whose last scrape predates the notification's sent-at time.
func (p Profile) CheckNotificationScrapesNeeded(trove TweetTrove) []NotificationID {
	needs_scrape := []NotificationID{}
	for id, n := range trove.Notifications {
		if !n.HasDetail {
			// No detail page for this notification; nothing to fetch
			continue
		}
		// Look up when (if ever) this notification was last scraped
		var prev_scrape Timestamp
		err := p.DB.Get(&prev_scrape, `select last_scraped_at from notifications where id = ?`, id)
		switch {
		case errors.Is(err, sql.ErrNoRows):
			// Never scraped before
			needs_scrape = append(needs_scrape, id)
		case err != nil:
			panic(err)
		case prev_scrape.Time.Before(n.SentAt.Time):
			// Stale: the notification was updated after its last scrape
			needs_scrape = append(needs_scrape, id)
		}
	}
	return needs_scrape
}

View File

@ -414,6 +414,9 @@ create table notifications (rowid integer primary key,
action_tweet_id integer references tweets(id), -- tweet associated with the notification action_tweet_id integer references tweets(id), -- tweet associated with the notification
action_retweet_id integer references retweets(retweet_id), action_retweet_id integer references retweets(retweet_id),
has_detail boolean not null default 0,
last_scraped_at not null default 0,
foreign key(type) references notification_types(rowid) foreign key(type) references notification_types(rowid)
foreign key(user_id) references users(id) foreign key(user_id) references users(id)
); );

View File

@ -410,6 +410,8 @@ func create_dummy_notification() Notification {
ActionUserID: create_stable_user().ID, ActionUserID: create_stable_user().ID,
ActionTweetID: create_stable_tweet().ID, ActionTweetID: create_stable_tweet().ID,
ActionRetweetID: create_stable_retweet().RetweetID, ActionRetweetID: create_stable_retweet().RetweetID,
HasDetail: true,
LastScrapedAt: TimestampFromUnix(57234728),
TweetIDs: []TweetID{create_stable_tweet().ID}, TweetIDs: []TweetID{create_stable_tweet().ID},
UserIDs: []UserID{create_stable_user().ID}, UserIDs: []UserID{create_stable_user().ID},
RetweetIDs: []TweetID{create_stable_retweet().RetweetID}, RetweetIDs: []TweetID{create_stable_retweet().RetweetID},

View File

@ -333,6 +333,9 @@ var MIGRATIONS = []string{
action_tweet_id integer references tweets(id), -- tweet associated with the notification action_tweet_id integer references tweets(id), -- tweet associated with the notification
action_retweet_id integer references retweets(retweet_id), action_retweet_id integer references retweets(retweet_id),
has_detail boolean not null default 0,
last_scraped_at not null default 0,
foreign key(type) references notification_types(rowid) foreign key(type) references notification_types(rowid)
foreign key(user_id) references users(id) foreign key(user_id) references users(id)
); );

View File

@ -2,10 +2,12 @@ package scraper
import ( import (
"errors" "errors"
"fmt"
"net/url" "net/url"
"regexp" "regexp"
"sort" "sort"
"strings" "strings"
"time"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
) )
@ -56,6 +58,41 @@ func (api *API) GetNotifications(how_many int) (TweetTrove, error) {
} }
trove.MergeWith(new_trove) trove.MergeWith(new_trove)
} }
return trove, nil
}
func (api *API) GetNotificationDetailForAll(trove TweetTrove, to_scrape []NotificationID) (TweetTrove, error) {
for _, n_id := range to_scrape {
notification := trove.Notifications[n_id]
resp, err := api.GetNotificationDetail(notification)
if errors.Is(err, ErrRateLimited) {
log.Warnf("Rate limited!")
break
} else if err != nil {
return TweetTrove{}, err
}
// Fetch the notification detail
new_trove, ids, err := resp.ToTweetTroveAsNotificationDetail()
if err != nil {
panic(err)
}
trove.MergeWith(new_trove)
// Add the fetched Tweet / Retweet IDs to the notification
for _, id := range ids {
_, is_retweet := trove.Retweets[id]
if is_retweet {
notification.RetweetIDs = append(notification.RetweetIDs, id)
} else {
notification.TweetIDs = append(notification.TweetIDs, id)
}
}
// Update the notification's last_scraped_at
notification.LastScrapedAt = Timestamp{time.Now()}
trove.Notifications[n_id] = notification
}
return trove, nil return trove, nil
} }
@ -91,6 +128,17 @@ func (t *TweetResponse) ToTweetTroveAsNotifications(current_user_id UserID) (Twe
notification.Type = NOTIFICATION_TYPE_QUOTE_TWEET notification.Type = NOTIFICATION_TYPE_QUOTE_TWEET
} else if strings.Contains(entry.Content.Item.ClientEventInfo.Element, "mentioned") { } else if strings.Contains(entry.Content.Item.ClientEventInfo.Element, "mentioned") {
notification.Type = NOTIFICATION_TYPE_MENTION notification.Type = NOTIFICATION_TYPE_MENTION
} else if strings.Contains(entry.Content.Item.ClientEventInfo.Element, "live_broadcast") {
// TODO: broadcast
notification.Type = NOTIFICATION_TYPE_USER_IS_LIVE
} else if strings.Contains(entry.Content.Item.ClientEventInfo.Element, "community_tweet_pinned") {
// TODO: communities
delete(ret.Notifications, notification.ID)
continue
}
if strings.Contains(entry.Content.Item.ClientEventInfo.Element, "multiple") {
notification.HasDetail = true
} }
if entry.Content.Item.Content.Tweet.ID != 0 { if entry.Content.Item.Content.Tweet.ID != 0 {
@ -161,3 +209,39 @@ func ParseSingleNotification(n APINotification) Notification {
return ret return ret
} }
// GetNotificationDetail fetches the detail timeline for a single notification
// (e.g., the list of posts in a "user liked multiple posts" notification).
// Returns the raw TweetResponse; any error from the HTTP layer is passed through.
func (api *API) GetNotificationDetail(n Notification) (TweetResponse, error) {
	// Don't shadow the imported `url` package; use a short local name instead
	u, err := url.Parse(fmt.Sprintf("https://twitter.com/i/api/2/notifications/view/%s.json", n.ID))
	if err != nil {
		// The URL is built from a format string; failure to parse is a programming error
		panic(err)
	}
	q := u.Query()
	add_tweet_query_params(&q)
	u.RawQuery = q.Encode()

	var result TweetResponse
	err = api.do_http(u.String(), "", &result)
	return result, err
}
// ToTweetTroveAsNotificationDetail parses a notification detail response into a
// TweetTrove, also returning the IDs of the tweet entries that appear in the
// detail timeline (in sorted entry order).
func (t *TweetResponse) ToTweetTroveAsNotificationDetail() (TweetTrove, []TweetID, error) {
	item_ids := []TweetID{}

	trove, err := t.ToTweetTrove()
	if err != nil {
		return TweetTrove{}, item_ids, err
	}

	// Walk the "addEntries" instructions, collecting every tweet entry's ID
	for _, instruction := range t.Timeline.Instructions {
		sort.Sort(instruction.AddEntries.Entries)
		for _, e := range instruction.AddEntries.Entries {
			if e.Content.Item.Content.Tweet.ID != 0 {
				item_ids = append(item_ids, TweetID(e.Content.Item.Content.Tweet.ID))
			}
		}
	}
	return trove, item_ids, nil
}

View File

@ -119,6 +119,21 @@ func TestParseNotificationsPage(t *testing.T) {
assert.Len(notif10.RetweetIDs, 1) assert.Len(notif10.RetweetIDs, 1)
assert.Contains(notif10.RetweetIDs, TweetID(1827183097382654351)) assert.Contains(notif10.RetweetIDs, TweetID(1827183097382654351))
notif11, is_ok := tweet_trove.Notifications["FDzeDIfVUAIAAAABiJONco_yJRHyMqRjxDY"]
assert.True(is_ok)
assert.Equal(NOTIFICATION_TYPE_USER_IS_LIVE, notif11.Type)
assert.Equal(UserID(277536867), notif11.ActionUserID)
// 1 user liked multiple posts
notif12, is_ok := tweet_trove.Notifications["FDzeDIfVUAIAAAABiJONco_yJRESfwtSqvg"]
assert.True(is_ok)
assert.True(notif12.HasDetail)
// TODO: communities
// notif12, is_ok := tweet_trove.Notifications["FDzeDIfVUAIAAAABiJONco_yJRHPBNsDH88"]
// assert.True(is_ok)
// assert.Equal(NOTIFICATION_TYPE_COMMUNITY_PINNED_POST, notif12.Type)
// Check users // Check users
for _, u_id := range []UserID{1458284524761075714, 28815778, 1633158398555353096} { for _, u_id := range []UserID{1458284524761075714, 28815778, 1633158398555353096} {
_, is_ok := tweet_trove.Users[u_id] _, is_ok := tweet_trove.Users[u_id]
@ -155,3 +170,25 @@ func TestParseNotificationsEndOfFeed(t *testing.T) {
assert.True(resp.IsEndOfFeed()) assert.True(resp.IsEndOfFeed())
} }
// TestParseNotificationDetail checks that a saved notification-detail response
// parses into a trove containing the expected tweet and retweet IDs.
func TestParseNotificationDetail(t *testing.T) {
	assert := assert.New(t)
	require := require.New(t)

	raw_json, err := os.ReadFile("test_responses/notifications/notification_detail.json")
	require.NoError(err)
	var response TweetResponse
	require.NoError(json.Unmarshal(raw_json, &response))

	trove, item_ids, err := response.ToTweetTroveAsNotificationDetail()
	require.NoError(err)

	// Both items should be reported: one tweet and one retweet
	assert.Len(item_ids, 2)
	assert.Contains(item_ids, TweetID(1827544032714633628))
	assert.Contains(item_ids, TweetID(1826743131108487390))
	_, is_tweet := trove.Tweets[1826743131108487390]
	assert.True(is_tweet)
	_, is_retweet := trove.Retweets[1827544032714633628]
	assert.True(is_retweet)
}

View File

@ -52,6 +52,10 @@ type Notification struct {
ActionTweetID TweetID `db:"action_tweet_id"` ActionTweetID TweetID `db:"action_tweet_id"`
ActionRetweetID TweetID `db:"action_retweet_id"` ActionRetweetID TweetID `db:"action_retweet_id"`
// Used for "multiple" notifs, like "user liked multiple tweets"
HasDetail bool `db:"has_detail"`
LastScrapedAt Timestamp `db:"last_scraped_at"`
TweetIDs []TweetID TweetIDs []TweetID
UserIDs []UserID UserIDs []UserID
RetweetIDs []TweetID RetweetIDs []TweetID

File diff suppressed because one or more lines are too long