From 647dd8aa6bf7ac8e44f80aa9002a855a558bde0e Mon Sep 17 00:00:00 2001 From: Alessio Date: Sun, 25 Jul 2021 14:51:17 -0700 Subject: [PATCH] Add scraping of videos from tweets --- persistence/profile.go | 2 +- scraper/api_types.go | 13 +++++++---- scraper/test_responses/tweet_with_video.json | 1 + scraper/tweet.go | 14 ++++++++++++ scraper/tweet_test.go | 24 ++++++++++++++++++++ 5 files changed, 49 insertions(+), 5 deletions(-) create mode 100644 scraper/test_responses/tweet_with_video.json diff --git a/persistence/profile.go b/persistence/profile.go index 893e449..0b427e5 100644 --- a/persistence/profile.go +++ b/persistence/profile.go @@ -158,7 +158,7 @@ func LoadProfile(profile_dir string) (Profile, error) { if err != nil { return Profile{}, err } - db, err := sql.Open("sqlite3", sqlite_file) + db, err := sql.Open("sqlite3", sqlite_file + "?_foreign_keys=on") if err != nil { return Profile{}, err } diff --git a/scraper/api_types.go b/scraper/api_types.go index 2bf9094..fe8994d 100644 --- a/scraper/api_types.go +++ b/scraper/api_types.go @@ -6,6 +6,14 @@ import ( "encoding/json" ) +type SortableVariants []struct { + Bitrate int `json:"bitrate,omitempty"` + URL string `json:"url"` +} +func (v SortableVariants) Len() int { return len(v) } +func (v SortableVariants) Swap(i, j int) { v[i], v[j] = v[j], v[i] } +func (v SortableVariants) Less(i, j int) bool { return v[i].Bitrate > v[j].Bitrate } + type APITweet struct { ID string `json:"id_str"` ConversationIDStr string `json:"conversation_id_str"` @@ -36,10 +44,7 @@ type APITweet struct { MediaURLHttps string `json:"media_url_https"` Type string `json:"type"` VideoInfo struct { - Variants []struct { - Bitrate int `json:"bitrate,omitempty"` - URL string `json:"url"` - } `json:"variants"` + Variants SortableVariants `json:"variants"` } `json:"video_info"` } `json:"media"` } `json:"extended_entities"` diff --git a/scraper/test_responses/tweet_with_video.json b/scraper/test_responses/tweet_with_video.json new file mode 100644 index 0000000..1606280 --- /dev/null +++ b/scraper/test_responses/tweet_with_video.json @@ -0,0 +1 @@ +{"created_at":"Sun Jul 25 00:49:32 +0000 2021","id_str":"1419097448786452480","full_text":"Geezus \n\n https://t.co/HfIexbaCLe","display_text_range":[0,33],"entities":{"media":[{"id_str":"1418951950020845568","indices":[10,33],"media_url":"http://pbs.twimg.com/ext_tw_video_thumb/1418951950020845568/pu/img/eUTaYYfuAJ8FyjUi.jpg","media_url_https":"https://pbs.twimg.com/ext_tw_video_thumb/1418951950020845568/pu/img/eUTaYYfuAJ8FyjUi.jpg","url":"https://t.co/HfIexbaCLe","display_url":"pic.twitter.com/HfIexbaCLe","expanded_url":"https://twitter.com/crazykarens/status/1418953196484038665/video/1","type":"photo","original_info":{"width":720,"height":1280},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"small":{"w":383,"h":680,"resize":"fit"},"large":{"w":720,"h":1280,"resize":"fit"},"medium":{"w":675,"h":1200,"resize":"fit"}},"source_status_id_str":"1418953196484038665","source_user_id_str":"1269387380127080448"}]},"extended_entities":{"media":[{"id_str":"1418951950020845568","indices":[10,33],"media_url":"http://pbs.twimg.com/ext_tw_video_thumb/1418951950020845568/pu/img/eUTaYYfuAJ8FyjUi.jpg","media_url_https":"https://pbs.twimg.com/ext_tw_video_thumb/1418951950020845568/pu/img/eUTaYYfuAJ8FyjUi.jpg","url":"https://t.co/HfIexbaCLe","display_url":"pic.twitter.com/HfIexbaCLe","expanded_url":"https://twitter.com/crazykarens/status/1418953196484038665/video/1","type":"video","original_info":{"width":720,"height":1280},"sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"small":{"w":383,"h":680,"resize":"fit"},"large":{"w":720,"h":1280,"resize":"fit"},"medium":{"w":675,"h":1200,"resize":"fit"}},"source_status_id_str":"1418953196484038665","source_user_id_str":"1269387380127080448","video_info":{"aspect_ratio":[9,16],"duration_millis":88300,"variants":[{"bitrate":632000,"content_type":"video/mp4","url":"https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/320x568/IXaQ5rPyf9mbD1aD.mp4?tag=12"},{"bitrate":950000,"content_type":"video/mp4","url":"https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/480x852/Z3vRCGq3hctSx7qv.mp4?tag=12"},{"content_type":"application/x-mpegURL","url":"https://video.twimg.com/ext_tw_video/1418951950020845568/pu/pl/cB33qJYlO9sdI44P.m3u8?tag=12&container=fmp4"},{"bitrate":2176000,"content_type":"video/mp4","url":"https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/720x1280/sm4iL9_f8Lclh0aa.mp4?tag=12"}]},"media_key":"7_1418951950020845568","ext_alt_text":null,"ext_media_availability":{"status":"available"},"ext_media_color":{"palette":[{"rgb":{"red":57,"green":59,"blue":64},"percentage":48.69},{"rgb":{"red":123,"green":137,"blue":108},"percentage":19.14},{"rgb":{"red":109,"green":112,"blue":133},"percentage":10.69},{"rgb":{"red":135,"green":158,"blue":98},"percentage":4.7},{"rgb":{"red":227,"green":214,"blue":206},"percentage":2.42}]},"ext":{"mediaStats":{"r":{"ok":{"viewCount":"275952"}},"ttl":-1}},"additional_media_info":{"monetizable":false,"source_user":{"id_str":"1269387380127080448","name":"Karen","screen_name":"crazykarens","location":"Karen","description":"EXCLUSIVE Karen Content & Updates","entities":{"description":{}},"followers_count":57393,"fast_followers_count":0,"normal_followers_count":57393,"friends_count":0,"listed_count":122,"created_at":"Sat Jun 06 21:55:18 +0000 2020","favourites_count":5,"statuses_count":566,"media_count":489,"profile_image_url_https":"https://pbs.twimg.com/profile_images/1398068069830901761/6R8k799k_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/1269387380127080448/1591619036","profile_image_extensions_alt_text":null,"profile_image_extensions_media_availability":null,"profile_image_extensions_media_color":{"palette":[{"rgb":{"red":255,"green":255,"blue":255},"percentage":84.59},{"rgb":{"red":43,"green":43,"blue":43},"percentage":9.22}]},"profile_image_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_banner_extensions_media_color":{"palette":[{"rgb":{"red":247,"green":247,"blue":247},"percentage":90.06},{"rgb":{"red":33,"green":33,"blue":33},"percentage":6.29},{"rgb":{"red":119,"green":119,"blue":119},"percentage":4.94}]},"profile_banner_extensions_alt_text":null,"profile_banner_extensions_media_availability":null,"profile_banner_extensions":{"mediaStats":{"r":{"missing":null},"ttl":-1}},"profile_link_color":"1DA1F2","has_extended_profile":true,"default_profile":true,"pinned_tweet_ids":[],"pinned_tweet_ids_str":[],"has_custom_timelines":true,"advertiser_account_type":"none","advertiser_account_service_levels":[],"profile_interstitial_type":"","business_profile_state":"none","translator_type":"none","withheld_in_countries":[],"ext":{"highlightedLabel":{"r":{"ok":{}},"ttl":-1}}}}}]},"source":"Twitter for iPhone","user_id_str":"358545917","retweet_count":53,"favorite_count":391,"reply_count":123,"quote_count":20,"conversation_id_str":"1419097448786452480","possibly_sensitive_editable":true,"lang":"nl"} diff --git a/scraper/tweet.go b/scraper/tweet.go index f6abaa5..5665232 100644 --- a/scraper/tweet.go +++ b/scraper/tweet.go @@ -3,6 +3,7 @@ package scraper import ( "time" "fmt" + "sort" "offline_twitter/terminal_utils" ) @@ -105,6 +106,19 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) { } ret.QuotedTweet = TweetID(apiTweet.QuotedStatusIDStr) + + for _, entity := range apiTweet.ExtendedEntities.Media { + if entity.Type != "video" { + continue + } + if len(apiTweet.ExtendedEntities.Media) != 1 { + panic(fmt.Sprintf("Surprising ExtendedEntities: %v", apiTweet.ExtendedEntities.Media)) + } + variants := apiTweet.ExtendedEntities.Media[0].VideoInfo.Variants + sort.Sort(variants) + ret.Video = variants[0].URL + ret.Images = []string{} + } return } diff --git a/scraper/tweet_test.go b/scraper/tweet_test.go index 2486529..f77b66b 100644 --- a/scraper/tweet_test.go +++ b/scraper/tweet_test.go @@ -138,6 +138,30 @@ func TestParseSingleTweet2(t *testing.T) { } +func TestParseTweetWithVideo(t *testing.T) { + data, err := ioutil.ReadFile("test_responses/tweet_with_video.json") + if err != nil { + panic(err) + } + var apitweet scraper.APITweet + err = json.Unmarshal(data, &apitweet) + if err != nil { + t.Errorf(err.Error()) + } + tweet, err := scraper.ParseSingleTweet(apitweet) + if err != nil { + t.Errorf(err.Error()) + } + expected_video := "https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/720x1280/sm4iL9_f8Lclh0aa.mp4?tag=12" + if tweet.Video != expected_video { + t.Errorf("Expected video %q, but got %q", expected_video, tweet.Video) + } + + if len(tweet.Images) != 0 { + t.Errorf("Should not have any images, but has %d", len(tweet.Images)) + } +} + func TestParseTweetResponse(t *testing.T) { data, err := ioutil.ReadFile("test_responses/michael_malice_feed.json") if err != nil {