Add scraping of videos from tweets
This commit is contained in:
parent
15241f4f43
commit
647dd8aa6b
@ -158,7 +158,7 @@ func LoadProfile(profile_dir string) (Profile, error) {
|
||||
if err != nil {
|
||||
return Profile{}, err
|
||||
}
|
||||
db, err := sql.Open("sqlite3", sqlite_file)
|
||||
db, err := sql.Open("sqlite3", sqlite_file + "?_foreign_keys=on")
|
||||
if err != nil {
|
||||
return Profile{}, err
|
||||
}
|
||||
|
@ -6,6 +6,14 @@ import (
|
||||
"encoding/json"
|
||||
)
|
||||
|
||||
// SortableVariants is a list of video variants as decoded from the
// "variants" array of a tweet's video_info. The Len, Swap and Less methods
// declared on this type let it be passed to sort.Sort, which orders it by
// descending Bitrate so that element 0 is the highest-bitrate variant.
type SortableVariants []struct {
|
||||
// Bitrate is read from the "bitrate" JSON key; a variant without that key
// decodes with Bitrate 0.
Bitrate int `json:"bitrate,omitempty"`
|
||||
// URL is read from the "url" JSON key.
URL string `json:"url"`
|
||||
}
|
||||
// Len reports the number of variants in v. It is part of sort.Interface.
func (v SortableVariants) Len() int {
	count := len(v)
	return count
}
|
||||
// Swap exchanges the variants at indexes i and j in place. It is part of
// sort.Interface.
func (v SortableVariants) Swap(i, j int) {
	held := v[i]
	v[i] = v[j]
	v[j] = held
}
|
||||
// Less reports whether the variant at index i must sort before the one at
// index j. The comparison is deliberately inverted (higher bitrate first) so
// that sorting yields descending bitrate order.
func (v SortableVariants) Less(i, j int) bool {
	return v[j].Bitrate < v[i].Bitrate
}
|
||||
|
||||
type APITweet struct {
|
||||
ID string `json:"id_str"`
|
||||
ConversationIDStr string `json:"conversation_id_str"`
|
||||
@ -36,10 +44,7 @@ type APITweet struct {
|
||||
MediaURLHttps string `json:"media_url_https"`
|
||||
Type string `json:"type"`
|
||||
VideoInfo struct {
|
||||
Variants []struct {
|
||||
Bitrate int `json:"bitrate,omitempty"`
|
||||
URL string `json:"url"`
|
||||
} `json:"variants"`
|
||||
Variants SortableVariants `json:"variants"`
|
||||
} `json:"video_info"`
|
||||
} `json:"media"`
|
||||
} `json:"extended_entities"`
|
||||
|
1
scraper/test_responses/tweet_with_video.json
Normal file
1
scraper/test_responses/tweet_with_video.json
Normal file
@ -0,0 +1 @@
|
||||
{"created_at":"Sun Jul 25 00:49:32 +0000 2021","id_str":"1419097448786452480","full_text":"Geezus \n\n https://t.co/HfIexbaCLe","display_text_range":[0,33],"entities":{"media":[{"id_str":"1418951950020845568","indices":[10,33],"media_url":"http://pbs.twimg.com/ext_tw_video_thumb/1418951950020845568/pu/img/eUTaYYfuAJ8FyjUi.jpg","media_url_https":"https://pbs.twimg.com/ext_tw_video_thumb/1418951950020845568/pu/img/eUTaYYfuAJ8FyjUi.jpg","url":"https://t.co/HfIexbaCLe","display_url":"pic.twitter.com/HfIexbaCLe","expanded_url":"https://twitter.com/crazykarens/status/1418953196484038665/video/1","type":"photo","original_info":{"width":720,"height":1280},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"small":{"w":383,"h":680,"resize":"fit"},"large":{"w":720,"h":1280,"resize":"fit"},"medium":{"w":675,"h":1200,"resize":"fit"}},"source_status_id_str":"1418953196484038665","source_user_id_str":"1269387380127080448"}]},"extended_entities":{"media":[{"id_str":"1418951950020845568","indices":[10,33],"media_url":"http://pbs.twimg.com/ext_tw_video_thumb/1418951950020845568/pu/img/eUTaYYfuAJ8FyjUi.jpg","media_url_https":"https://pbs.twimg.com/ext_tw_video_thumb/1418951950020845568/pu/img/eUTaYYfuAJ8FyjUi.jpg","url":"https://t.co/HfIexbaCLe","display_url":"pic.twitter.com/HfIexbaCLe","expanded_url":"https://twitter.com/crazykarens/status/1418953196484038665/video/1","type":"video","original_info":{"width":720,"height":1280},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"small":{"w":383,"h":680,"resize":"fit"},"large":{"w":720,"h":1280,"resize":"fit"},"medium":{"w":675,"h":1200,"resize":"fit"}},"source_status_id_str":"1418953196484038665","source_user_id_str":"1269387380127080448","video_info":{"aspect_ratio":[9,16],"duration_millis":88300,"variants":[{"bitrate":632000,"content_type":"video/mp4","url":"https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/320x568/IXaQ5rPyf9mbD1aD.mp4?tag=12"},{"bitrate":950000,"content_type":"video/mp4","url":"https://video.twimg.co
m/ext_tw_video/1418951950020845568/pu/vid/480x852/Z3vRCGq3hctSx7qv.mp4?tag=12"},{"content_type":"application/x-mpegURL","url":"https://video.twimg.com/ext_tw_video/1418951950020845568/pu/pl/cB33qJYlO9sdI44P.m3u8?tag=12&container=fmp4"},{"bitrate":2176000,"content_type":"video/mp4","url":"https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/720x1280/sm4iL9_f8Lclh0aa.mp4?tag=12"}]},"media_key":"7_1418951950020845568","ext_alt_text":null,"ext_media_availability":{"status":"available"},"ext_media_color":{"palette":[{"rgb":{"red":57,"green":59,"blue":64},"percentage":48.69},{"rgb":{"red":123,"green":137,"blue":108},"percentage":19.14},{"rgb":{"red":109,"green":112,"blue":133},"percentage":10.69},{"rgb":{"red":135,"green":158,"blue":98},"percentage":4.7},{"rgb":{"red":227,"green":214,"blue":206},"percentage":2.42}]},"ext":{"mediaStats":{"r":{"ok":{"viewCount":"275952"}},"ttl":-1}},"additional_media_info":{"monetizable":false,"source_user":{"id_str":"1269387380127080448","name":"Karen","screen_name":"crazykarens","location":"Karen","description":"EXCLUSIVE Karen Content & Updates","entities":{"description":{}},"followers_count":57393,"fast_followers_count":0,"normal_followers_count":57393,"friends_count":0,"listed_count":122,"created_at":"Sat Jun 06 21:55:18 +0000 
2020","favourites_count":5,"statuses_count":566,"media_count":489,"profile_image_url_https":"https://pbs.twimg.com/profile_images/1398068069830901761/6R8k799k_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/1269387380127080448/1591619036","profile_image_extensions_alt_text":null,"profile_image_extensions_media_availability":null,"profile_image_extensions_media_color":{"palette":[{"rgb":{"red":255,"green":255,"blue":255},"percentage":84.59},{"rgb":{"red":43,"green":43,"blue":43},"percentage":9.22}]},"profile_image_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_banner_extensions_media_color":{"palette":[{"rgb":{"red":247,"green":247,"blue":247},"percentage":90.06},{"rgb":{"red":33,"green":33,"blue":33},"percentage":6.29},{"rgb":{"red":119,"green":119,"blue":119},"percentage":4.94}]},"profile_banner_extensions_alt_text":null,"profile_banner_extensions_media_availability":null,"profile_banner_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_link_color":"1DA1F2","has_extended_profile":true,"default_profile":true,"pinned_tweet_ids":[],"pinned_tweet_ids_str":[],"has_custom_timelines":true,"advertiser_account_type":"none","advertiser_account_service_levels":[],"profile_interstitial_type":"","business_profile_state":"none","translator_type":"none","withheld_in_countries":[],"ext":{"highlightedLabel":{"r":{"ok":{}},"ttl":-1}}}}}]},"source":"<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>","user_id_str":"358545917","retweet_count":53,"favorite_count":391,"reply_count":123,"quote_count":20,"conversation_id_str":"1419097448786452480","possibly_sensitive_editable":true,"lang":"nl"}
|
@ -3,6 +3,7 @@ package scraper
|
||||
import (
|
||||
"time"
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"offline_twitter/terminal_utils"
|
||||
)
|
||||
@ -105,6 +106,19 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
|
||||
}
|
||||
|
||||
ret.QuotedTweet = TweetID(apiTweet.QuotedStatusIDStr)
|
||||
|
||||
for _, entity := range apiTweet.ExtendedEntities.Media {
|
||||
if entity.Type != "video" {
|
||||
continue
|
||||
}
|
||||
if len(apiTweet.ExtendedEntities.Media) != 1 {
|
||||
panic(fmt.Sprintf("Surprising ExtendedEntities: %v", apiTweet.ExtendedEntities.Media))
|
||||
}
|
||||
variants := apiTweet.ExtendedEntities.Media[0].VideoInfo.Variants
|
||||
sort.Sort(variants)
|
||||
ret.Video = variants[0].URL
|
||||
ret.Images = []string{}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -138,6 +138,30 @@ func TestParseSingleTweet2(t *testing.T) {
|
||||
}
|
||||
|
||||
|
||||
func TestParseTweetWithVideo(t *testing.T) {
|
||||
data, err := ioutil.ReadFile("test_responses/tweet_with_video.json")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
var apitweet scraper.APITweet
|
||||
err = json.Unmarshal(data, &apitweet)
|
||||
if err != nil {
|
||||
t.Errorf(err.Error())
|
||||
}
|
||||
tweet, err := scraper.ParseSingleTweet(apitweet)
|
||||
if err != nil {
|
||||
t.Errorf(err.Error())
|
||||
}
|
||||
expected_video := "https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/720x1280/sm4iL9_f8Lclh0aa.mp4?tag=12"
|
||||
if tweet.Video != expected_video {
|
||||
t.Errorf("Expected video %q, but got %q", expected_video, tweet.Video)
|
||||
}
|
||||
|
||||
if len(tweet.Images) != 0 {
|
||||
t.Errorf("Should not have any images, but has %d", len(tweet.Images))
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseTweetResponse(t *testing.T) {
|
||||
data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json")
|
||||
if err != nil {
|
||||
|
Loading…
x
Reference in New Issue
Block a user