139 lines
3.8 KiB
Go
139 lines
3.8 KiB
Go
package scraper
|
|
|
|
import (
	"fmt"
	"net/url"
	"path"
	"regexp"
	"strconv"
)
|
|
|
|
type Url struct {
|
|
TweetID TweetID `db:"tweet_id"`
|
|
Domain string `db:"domain"`
|
|
Text string `db:"text"`
|
|
ShortText string `db:"short_text"`
|
|
Title string `db:"title"`
|
|
Description string `db:"description"`
|
|
ThumbnailWidth int `db:"thumbnail_width"`
|
|
ThumbnailHeight int `db:"thumbnail_height"`
|
|
ThumbnailRemoteUrl string `db:"thumbnail_remote_url"`
|
|
ThumbnailLocalPath string `db:"thumbnail_local_path"`
|
|
CreatorID UserID `db:"creator_id"`
|
|
SiteID UserID `db:"site_id"`
|
|
|
|
HasCard bool `db:"has_card"`
|
|
HasThumbnail bool `db:"has_thumbnail"`
|
|
IsContentDownloaded bool `db:"is_content_downloaded"`
|
|
}
|
|
|
|
func (u Url) GetDomain() string {
|
|
if u.Domain != "" {
|
|
return u.Domain
|
|
}
|
|
urlstruct, err := url.Parse(u.Text)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
return urlstruct.Host
|
|
}
|
|
|
|
func ParseAPIUrlCard(apiCard APICard) Url {
|
|
values := apiCard.BindingValues
|
|
ret := Url{}
|
|
ret.HasCard = true
|
|
|
|
ret.Domain = values.Domain.Value
|
|
ret.Title = values.Title.Value
|
|
ret.Description = values.Description.Value
|
|
ret.IsContentDownloaded = false
|
|
ret.CreatorID = UserID(values.Creator.UserValue.Value)
|
|
ret.SiteID = UserID(values.Site.UserValue.Value)
|
|
|
|
var thumbnail_url string
|
|
|
|
if apiCard.Name == "summary_large_image" || apiCard.Name == "summary" {
|
|
thumbnail_url = values.Thumbnail.ImageValue.Url
|
|
} else if apiCard.Name == "player" {
|
|
thumbnail_url = values.PlayerImage.ImageValue.Url
|
|
} else {
|
|
panic("Unknown card type: " + apiCard.Name)
|
|
}
|
|
|
|
if thumbnail_url != "" {
|
|
ret.HasThumbnail = true
|
|
ret.ThumbnailRemoteUrl = thumbnail_url
|
|
ret.ThumbnailLocalPath = get_thumbnail_local_path(thumbnail_url)
|
|
ret.ThumbnailWidth = values.Thumbnail.ImageValue.Width
|
|
ret.ThumbnailHeight = values.Thumbnail.ImageValue.Height
|
|
}
|
|
|
|
return ret
|
|
}
|
|
|
|
// prefixed_path_regex matches the first two characters of a filename when both
// are word characters (letters, digits, underscore) or hyphens.  It is compiled
// once here rather than on every call to get_prefixed_path.
var prefixed_path_regex = regexp.MustCompile(`^[\w-]{2}`)

// get_prefixed_path places filename `p` inside a directory named after its
// first two characters, e.g. "abcdef.jpg" becomes "ab/abcdef.jpg".
//
// It panics if `p` does not start with two characters from the set matched by
// prefixed_path_regex.
func get_prefixed_path(p string) string {
	local_prefix := prefixed_path_regex.FindString(p)
	if len(local_prefix) != 2 {
		panic(fmt.Sprintf("Unable to extract a 2-letter prefix for filename %s", p))
	}
	return path.Join(local_prefix, p)
}
|
|
|
|
func get_thumbnail_local_path(remote_url string) string {
|
|
u, err := url.Parse(remote_url)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
if u.RawQuery == "" {
|
|
return path.Base(u.Path)
|
|
}
|
|
query_params, err := url.ParseQuery(u.RawQuery)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
return get_prefixed_path(
|
|
fmt.Sprintf("%s_%s.%s", path.Base(u.Path), query_params["name"][0], query_params["format"][0]),
|
|
)
|
|
}
|
|
|
|
// Given an URL, try to parse it as a tweet url.
|
|
// The bool is an `is_ok` value; true if the parse was successful, false if it didn't match
|
|
func TryParseTweetUrl(s string) (UserHandle, TweetID, bool) {
|
|
parsed_url, err := url.Parse(s)
|
|
if err != nil {
|
|
return UserHandle(""), TweetID(0), false
|
|
}
|
|
|
|
if parsed_url.Host != "twitter.com" && parsed_url.Host != "mobile.twitter.com" && parsed_url.Host != "x.com" {
|
|
return UserHandle(""), TweetID(0), false
|
|
}
|
|
|
|
r := regexp.MustCompile(`^/(\w+)/status/(\d+)$`)
|
|
matches := r.FindStringSubmatch(parsed_url.Path)
|
|
if matches == nil {
|
|
return UserHandle(""), TweetID(0), false
|
|
}
|
|
if len(matches) != 3 { // matches[0] is the full string
|
|
panic(matches)
|
|
}
|
|
return UserHandle(matches[1]), TweetID(int_or_panic(matches[2])), true
|
|
}
|
|
|
|
/**
|
|
* Given a tweet URL, return the corresponding user handle.
|
|
* If tweet url is not valid, return an error.
|
|
*/
|
|
func ParseHandleFromTweetUrl(tweet_url string) (UserHandle, error) {
|
|
short_url_regex := regexp.MustCompile(`^https://t.co/\w{5,20}$`)
|
|
if short_url_regex.MatchString(tweet_url) {
|
|
tweet_url = ExpandShortUrl(tweet_url)
|
|
}
|
|
|
|
ret, _, is_ok := TryParseTweetUrl(tweet_url)
|
|
if !is_ok {
|
|
return "", fmt.Errorf("Invalid tweet url: %s", tweet_url)
|
|
}
|
|
return ret, nil
|
|
}
|