Add Url type

This commit is contained in:
Alessio 2021-09-17 13:41:43 -07:00
parent 06831dfb52
commit 3f9c807efa
4 changed files with 132 additions and 0 deletions

View File

@ -32,6 +32,35 @@ type APIExtendedMedia struct {
} `json:"video_info"` } `json:"video_info"`
} }
// APICard is the decoding target for the JSON object stored under a tweet's
// "card" key (see the `Card` field on APITweet).  Only the entries of
// "binding_values" that are declared below get decoded.
type APICard struct {
BindingValues struct {
// "domain": plain string value
Domain struct {
Value string `json:"string_value"`
} `json:"domain"`
// "creator": a user reference; the ID arrives as a JSON string ("id_str") and is
// converted to an int64 by the `,string` tag option
Creator struct {
UserValue struct {
Value int64 `json:"id_str,string"`
} `json:"user_value"`
} `json:"creator"`
// "site": same layout as "creator"
Site struct {
UserValue struct {
Value int64 `json:"id_str,string"`
} `json:"user_value"`
} `json:"site"`
// "title": plain string value
Title struct {
Value string `json:"string_value"`
} `json:"title"`
// "description": plain string value
Description struct {
Value string `json:"string_value"`
} `json:"description"`
// Note that the JSON key is "thumbnail_image_large", not "thumbnail"; only the
// image's "url" is decoded
Thumbnail struct {
ImageValue struct {
Url string `json:"url"`
} `json:"image_value"`
} `json:"thumbnail_image_large"`
} `json:"binding_values"`
}
type APITweet struct { type APITweet struct {
ID int64 `json:"id_str,string"` ID int64 `json:"id_str,string"`
ConversationID int64 `json:"conversation_id_str,string"` ConversationID int64 `json:"conversation_id_str,string"`
@ -66,6 +95,7 @@ type APITweet struct {
QuotedStatusID int64 QuotedStatusID int64
Time time.Time `json:"time"` Time time.Time `json:"time"`
UserID int64 `json:"user_id_str,string"` UserID int64 `json:"user_id_str,string"`
Card APICard `json:"card"`
} }
func (t *APITweet) NormalizeContent() { func (t *APITweet) NormalizeContent() {

File diff suppressed because one or more lines are too long

47
scraper/url.go Normal file
View File

@ -0,0 +1,47 @@
package scraper
import (
"fmt"
"path"
"net/url"
)
// Url holds the data extracted from an API card (see ParseAPIUrlCard).
type Url struct {
// Domain, Title and Description are copied from the card's string bindings
// of the same names.
Domain string
// Text is not populated by ParseAPIUrlCard.
// NOTE(review): presumably the link text as it appears in the tweet -- confirm against callers
Text string
Title string
Description string
// ThumbnailRemoteUrl is the image URL reported by the card.
ThumbnailRemoteUrl string
// ThumbnailLocalPath is a file name derived from ThumbnailRemoteUrl
// (last path segment + "_" + `name` query param + "." + `format` query param).
ThumbnailLocalPath string
// CreatorID and SiteID are the user IDs from the card's "creator" and "site" bindings.
CreatorID UserID
SiteID UserID
// IsContentDownloaded is set to false by ParseAPIUrlCard.
// NOTE(review): presumably tracks whether the thumbnail has been saved locally -- confirm
IsContentDownloaded bool
}
// ParseAPIUrlCard converts a decoded APICard into a Url.
//
// Domain, Title, Description, the thumbnail URL and the creator / site user IDs
// are copied from the card's binding values.  The thumbnail's local file name is
// computed from its remote URL by get_thumbnail_local_path.  `Text` is left empty
// and `IsContentDownloaded` is always false on the returned value.
func ParseAPIUrlCard(apiCard APICard) Url {
	bindings := apiCard.BindingValues
	thumbnail_url := bindings.Thumbnail.ImageValue.Url

	var ret Url
	ret.Domain = bindings.Domain.Value
	ret.Title = bindings.Title.Value
	ret.Description = bindings.Description.Value
	ret.ThumbnailRemoteUrl = thumbnail_url
	ret.ThumbnailLocalPath = get_thumbnail_local_path(thumbnail_url)
	ret.CreatorID = UserID(bindings.Creator.UserValue.Value)
	ret.SiteID = UserID(bindings.Site.UserValue.Value)
	ret.IsContentDownloaded = false
	return ret
}
// get_thumbnail_local_path derives the local file name for a card thumbnail
// from its remote URL.
//
// The result is "<last path segment>_<name>.<format>", where `name` and `format`
// are query parameters of the URL.  For example
// "https://pbs.twimg.com/card_img/123/odDi9EqO?format=jpg&name=600x600"
// becomes "odDi9EqO_600x600.jpg".
//
// An empty `remote_url` (a card without a thumbnail) yields an empty string.
//
// Panics if the URL or its query string cannot be parsed, or if either the
// `name` or the `format` query parameter is absent.
func get_thumbnail_local_path(remote_url string) string {
	// No thumbnail => no local file.  Without this guard, a card lacking a
	// thumbnail would crash on the query-parameter lookup below.
	if remote_url == "" {
		return ""
	}

	u, err := url.Parse(remote_url)
	if err != nil {
		panic(err)
	}
	query_params, err := url.ParseQuery(u.RawQuery)
	if err != nil {
		panic(err)
	}

	// Indexing `query_params[key][0]` directly on a missing key dies with an
	// opaque "index out of range"; report which parameter is missing instead.
	for _, key := range []string{"name", "format"} {
		if len(query_params[key]) == 0 {
			panic(fmt.Errorf("thumbnail url %q has no %q query parameter", remote_url, key))
		}
	}

	return fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params.Get("name"), query_params.Get("format"))
}

54
scraper/url_test.go Normal file
View File

@ -0,0 +1,54 @@
package scraper_test
import (
"testing"
"io/ioutil"
"encoding/json"
"offline_twitter/scraper"
)
// TestParseAPIUrlCard decodes the saved card payload in
// "test_responses/url_card.json" and checks every field that
// scraper.ParseAPIUrlCard fills in on the resulting Url.
func TestParseAPIUrlCard(t *testing.T) {
	raw, err := ioutil.ReadFile("test_responses/url_card.json")
	if err != nil {
		panic(err)
	}
	var card scraper.APICard
	if err = json.Unmarshal(raw, &card); err != nil {
		t.Fatal(err.Error())
	}
	parsed := scraper.ParseAPIUrlCard(card)

	// String-valued fields, checked in declaration order
	string_checks := []struct {
		expected string
		actual   string
	}{
		{"reason.com", parsed.Domain},
		{"L.A. Teachers Union Leader: 'There's No Such Thing As Learning Loss'", parsed.Title},
		{"\"Its OK that our babies may not have learned all their times tables,\" says Cecily Myart-Cruz. \"They learned resilience.\"", parsed.Description},
		{"https://pbs.twimg.com/card_img/1434998862305968129/odDi9EqO?format=jpg&name=600x600", parsed.ThumbnailRemoteUrl},
		{"odDi9EqO_600x600.jpg", parsed.ThumbnailLocalPath},
	}
	for _, check := range string_checks {
		if check.actual != check.expected {
			t.Errorf("Expected %q, got %q", check.expected, check.actual)
		}
	}

	// User-ID fields
	id_checks := []struct {
		expected scraper.UserID
		actual   scraper.UserID
	}{
		{scraper.UserID(155581583), parsed.CreatorID},
		{scraper.UserID(16467567), parsed.SiteID},
	}
	for _, check := range id_checks {
		if check.actual != check.expected {
			t.Errorf("Expected %d, got %d", check.expected, check.actual)
		}
	}

	if parsed.IsContentDownloaded {
		t.Errorf("Expected it not to be downloaded, but it was")
	}
}