Last active
November 4, 2019 15:59
-
-
Save gabemarshall/2f7d815050b463faf6b8edab18ddfa12 to your computer and use it in GitHub Desktop.
Ruby script to search GitHub and dump a list of individual files as well as repos
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# gem install faraday rb-readline
# export GITHUB_AUTH=[replace with GitHub API token]
# Ex: ruby gitr_dump.rb '"corp.contoso.com"'
require "faraday" | |
require "json" | |
require "csv" | |
require "rb-readline" | |
# Accumulators shared with the rest of the script: direct file URLs and
# repository URLs gathered from the search results.
$search_results_urls = []
$search_results_giturls = []

# The search term is the first command-line argument.
search_term = ARGV[0]

# Fail fast on missing input. `abort` prints the message to stderr and exits
# with a non-zero status, so a calling shell can tell the run failed (a plain
# `exit` reports success).
abort "Missing search term" if search_term.nil? || search_term.strip.empty?
abort "Missing required Github OAUTH Token" if ENV["GITHUB_AUTH"].to_s.empty?

# The query sent to the API is the term exactly as given on the command line.
search_query = search_term
# Requests one page of GitHub code-search results.
#
# @param term [String] search query, sent as the `q` parameter
# @param page [Integer, String] page number to request (100 results per page)
# @return [Faraday::Response] the response of the GET request, unparsed
# @raise [KeyError] if the GITHUB_AUTH environment variable is not set
def do_search_api(term, page)
  token = ENV.fetch("GITHUB_AUTH")
  connection = Faraday.new(url: "https://api.github.com/")

  # Pass the query as a params hash instead of concatenating it into the URL:
  # Faraday then URL-encodes each value, so quotes, spaces, `&` or `#` in the
  # search term cannot corrupt or inject query-string parameters.
  query = { page: page, per_page: 100, q: term }

  connection.get("/search/code", query) do |request|
    request.headers["Content-Type"] = "application/json"
    request.headers["Authorization"] = "token #{token}"
  end
end
# Hands every element of +items+ to out_gitrob_csv, one at a time.
#
# Only an object whose class is exactly Array is accepted; for anything else
# the method prints "err" and forwards nothing.
def gitrob_output(items)
  return puts("err") unless items.instance_of?(Array)

  items.each { |entry| out_gitrob_csv(entry) }
end
# Fetch the first page up front: besides the first batch of items it carries
# the total hit count used to size the rest of the dump.
results = JSON.parse(do_search_api(search_query, 1).body)
search_count = results["total_count"]

# A response without total_count is not a search result (for example an error
# payload). Report it and stop instead of dropping into a debugger that was
# never required and then comparing nil with an Integer.
if search_count.nil?
  abort "GitHub search failed: #{results["message"] || results.inspect}"
end

# Large result sets are fetched with a pause between page requests.
search_wait = 0
if search_count >= 3000
  puts "Results are over 3000! (dump will be throttled)"
  search_wait = 5
end

puts "Your search returned a total of #{search_count} results."
puts "Would you like to continue? (Y/n)"

# `gets` returns nil at end of input; treat that as "no" rather than crashing.
# An empty answer means yes, matching the capitalised default in the prompt.
reply = $stdin.gets
answer = reply.to_s.strip.upcase
proceed = !reply.nil? && (answer.empty? || answer == "Y")
unless proceed
  puts "Bye!"
  exit
end
# NOTE(review): the original carried the stray note "created:>2018-10-25"
# here — presumably a search qualifier the author experimented with; confirm.

# Turns the URL of a file's "blob" page into its "raw" counterpart. Only the
# /blob/ path segment directly after <host>/<owner>/<repo> is rewritten, so a
# "blob" inside an owner, repository, branch or file name is left untouched.
# URLs that do not have that shape are returned unchanged.
def raw_file_url(html_url)
  html_url.sub(%r{\A(https?://[^/]+/[^/]+/[^/]+)/blob/}, '\1/raw/')
end

# Appends the file and repository URLs of one page of search items to the
# script-wide accumulators.
collect_items = lambda do |items|
  $search_results_urls.concat(items.map { |item| raw_file_url(item["html_url"]) })
  $search_results_giturls.concat(items.map { |item| item["repository"]["html_url"] })
end

# The search API serves at most the first 1000 matches of a query, so asking
# for pages beyond that only produces error responses.
max_results = [search_count, 1000].min
if search_count > max_results
  puts "Only the first #{max_results} results can be retrieved from the search API."
end

# Float division so that ceil actually rounds up: 250 results -> 3 pages,
# 200 results -> 2 pages (no trailing empty page).
end_length = (max_results / 100.0).ceil

# The first page was already downloaded.
collect_items.call(Array(results["items"]))

(2..end_length).each do |page|
  puts "Requesting page #{page} of #{end_length}"
  results = JSON.parse(do_search_api(search_query, page).body)
  sleep search_wait

  items = results["items"]
  # A page without an items array is an error payload; keep what was gathered
  # so far and stop paging instead of crashing mid-dump.
  unless items.is_a?(Array)
    warn "Page #{page} failed: #{results["message"] || results.inspect}"
    break
  end

  collect_items.call(items)
end
# Remove duplicate URLs in place before writing anything out.
$search_results_urls.uniq!
$search_results_giturls.uniq!

file_links_path = "dumped_direct_file_urls.txt"
repo_links_path = "dumped_repo_urls.txt"

# Each list is written as one URL per line.
file_links_text = $search_results_urls.join("\n")
repo_links_text = $search_results_giturls.join("\n")

File.open(file_links_path, "w") do |out|
  out.puts(file_links_text)
end
File.open(repo_links_path, "w") do |out|
  out.puts(repo_links_text)
end

# Summary of what was saved and where.
file_link_total = $search_results_urls.length
repo_link_total = $search_results_giturls.length
puts "#{file_link_total} links to potentially interesting files saved to #{file_links_path}"
puts "#{repo_link_total} links to repos saved to #{repo_links_path}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment