Created
August 20, 2020 15:48
-
-
Save runo280/750e10c5f474f6d972eef85df8196cc8 to your computer and use it in GitHub Desktop.
c@ster
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package io.github.runo280; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
import java.io.IOException; | |
import java.net.URL; | |
public class Main { | |
public static void main(String[] args) throws IOException { | |
Document document = Jsoup.parse(new URL("https://caster.io/courses/android-machine-learning-with-tensorflow-lite-and-tf-keras"), 20 * 1000); | |
Element container = document.selectFirst("div[course-id]"); | |
Elements links = container.getElementsByTag("a"); | |
int index = 0; | |
for (Element e : links) { | |
index++; | |
String fileName = index + "- " + e.selectFirst("a").text().replaceAll("\\s\\d\\d\\:\\d\\d\\s.*$", "") + ".mp4"; | |
String link = e.attr("href"); | |
System.out.println(String.format("youtube-dl --config-location casterconf \"%s\" -o \"%s\"", link, fileName)); | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* This Java source file was generated by the Gradle 'init' task. | |
*/ | |
package io.runo280.casterdl; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
import java.io.File; | |
import java.io.FileWriter; | |
import java.io.IOException; | |
import java.net.URL; | |
public class App { | |
public static final String COURSES_URL = "https://caster.io/courses"; | |
static String BASH_HEADER = "#!/usr/bin/env bash"; | |
/*static String OPTIONS = "--ignore-config -f hd_mp4_video-1 --cookies cookie -i -c --external-downloader aria2c " + | |
"--external-downloader-args \"-c -s16 -k1M -x16 --enable-color=true --human-readable=true\" " + | |
"--user-agent \"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0\"";*/ | |
public static void main(String args[]) throws IOException { | |
Document coursesPage = getPage(COURSES_URL); | |
Elements courseList = coursesPage.select("a.cioc-cardgroup__item"); | |
for (Element c : courseList) { | |
String link = "https://caster.io" + c.attr("href"); | |
System.out.println(link); | |
String name = c.selectFirst("span.cioc-link--yellow").text().replaceAll(" ", "_"); | |
System.out.println(name + "\n\n"); | |
createDl(name, link); | |
} | |
} | |
private static Document getPage(String url) throws IOException { | |
return Jsoup.parse(new URL(url), 20 * 1000); | |
} | |
static void createDl(String name, String url) throws IOException { | |
Document document = getPage(url); | |
File file = new File("dl_" + FileUtils.sanitizeFilename(name) + ".sh"); | |
FileWriter fr = new FileWriter(file, true); | |
fr.write(BASH_HEADER); | |
fr.write("\nmkdir " + name + "\n"); | |
Element container = document.selectFirst("div[course-id]"); | |
Elements links = container.getElementsByTag("a"); | |
int index = 0; | |
for (Element e : links) { | |
index++; | |
String fileName = index + "- " + e.selectFirst("a").text().replaceAll("\\s\\d\\d\\:\\d\\d\\s.*$", "") + ".mp4"; | |
fileName = FileUtils.sanitizeFilename(fileName); | |
String link = e.attr("href"); | |
String line = String.format("\nyoutube-dl %s \"%s\" -o \"%s\\%s\"", getArgs(), link, name, fileName); | |
fr.write(line); | |
} | |
fr.close(); | |
} | |
static String getArgs(){ | |
StringBuilder sb = new StringBuilder(); | |
sb.append("--ignore-config "); | |
sb.append("--cookies cookie "); | |
sb.append("--external-downloader aria2c "); | |
sb.append("--user-agent \"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0\" "); | |
sb.append("-i -c "); | |
sb.append("-f \"(worst[width>=1080])[protocol^=http]/(worst[width>=720])[protocol^=http]\" "); | |
sb.append("--external-downloader-args \"-c -s16 -k1M -x16 --enable-color=true --human-readable=true\" "); | |
return sb.toString(); | |
} | |
} | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import string | |
import requests | |
from bs4 import BeautifulSoup | |
''' | |
format code extension resolution note | |
mp4-224p mp4 400x224 182k , mp4 container, h264, 1.63MiB | |
iphone-360p mp4 640x360 227k , mp4 container, h264, 2.03MiB | |
md_mp4-540p mp4 960x540 295k , mp4 container, h264, 2.63MiB | |
hd_mp4-720p mp4 1280x720 361k , mp4 container, h264, 3.22MiB | |
hd_mp4-1080p mp4 1920x1080 526k , mp4 container, h264, 4.68MiB | |
original bin 1920x1080 2660k , 23.68MiB (best) | |
''' | |
def format_filename(s):
    """Build a filename-safe string from ``s``.

    Works as a whitelist: only ASCII letters, digits and the characters
    ``-_.()`` plus the space survive; every other character is dropped.
    Surviving spaces are then turned into underscores.

    Note: the result can still be an unusable filename such as ``''``,
    ``'.'`` or ``'..'``. Callers are expected to add their own prefix
    (for example a date string like '2009_01_15_19_46_32_') and a file
    extension like '.txt' to rule that out.
    """
    whitelist = "-_.() %s%s" % (string.ascii_letters, string.digits)
    pieces = []
    for ch in s:
        if ch not in whitelist:
            continue
        # Spaces are allowed through the whitelist but written as underscores.
        pieces.append('_' if ch == ' ' else ch)
    return ''.join(pieces)
# Scrape one caster.io course page and print a youtube-dl command per lesson link.
course_url = 'https://caster.io/courses/kotlin-programming-language'
# A timeout keeps a stalled connection from hanging the script, and
# raise_for_status() stops an HTTP error page from being parsed as if it
# were the course page.
request = requests.get(course_url, timeout=20)
request.raise_for_status()
source = BeautifulSoup(request.text, 'html.parser')
# Every <div> that carries a course-id attribute, whatever its value.
course = source.find_all('div', {'course-id': re.compile(r'.*')})
# The same format is requested for every lesson.
video_format = 'hd_mp4-1080p'
index = 0
for r in course:
    lesson = r.find_all('a')
    print(len(lesson))
    for item in lesson:
        index += 1
        # Only the first line of the link text is used for the file name.
        file_name = item.text.strip().partition('\n')[0]
        file_name = format_filename(file_name)
        file_name = file_name.replace('_-_', '_')
        # Strip only a leading "<digits>." prefix. The pattern is anchored and
        # applied before the extension is added, so a title that ends in a
        # digit keeps the dot of ".mp4".
        file_name = re.sub(r'^\d+\.', '', file_name)
        file_name = f'{index:03d}' + file_name + '.mp4'
        url = item['href']
        command = f'youtube-dl --ignore-config -f {video_format} -i -c --external-downloader aria2c ' \
                  f'--external-downloader-args "-c -s16 -k1M -x16 --enable-color=true --human-readable=true" ' \
                  f'--user-agent "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0" "{url}" ' \
                  f'-o "{file_name}" '
        print(command)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment