Created
August 5, 2013 19:16
-
-
Save no-reply/6158616 to your computer and use it in GitHub Desktop.
Blobs of bagit-for-hydra stuff
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Hybag
  # Error raised when a bag cannot be imported (for example when it fails
  # the completeness, consistency or validity checks performed on ingest).
  class BagImportError < StandardError
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'bagit' | |
module Hybag
  # Mixin that writes the including object's datastreams out as a BagIt bag.
  #
  # The including object must respond to +pid+, +datastreams+ (a Hash of
  # label => datastream) and +metadata_streams+; all three are called below.
  module Baggable
    # Writes a fresh bag for this object, removing any bag that already
    # exists at the target directory first.
    #
    # Each datastream with non-nil content is written as:
    # * a tag file, when it is one of +bag_tags+;
    # * a tag file under "fedora/", when it is one of +bag_fedora_tags+;
    # * a payload file otherwise.
    #
    # @param path [String] sub-path handed to +bag_dir+ to locate the bag
    # @return [BagIt::Bag] the bag, after its manifests have been written
    # @raise [Exception] when the object's pid is the '__DO_NOT_USE__' placeholder
    def write_bag(path = '')
      # The exception class is kept as-is so existing rescuers keep working.
      raise Exception, "cannot write a bag for pid '__DO_NOT_USE__'" if pid == '__DO_NOT_USE__'

      # delete any existing bag before making a new one
      delete_bag(path)
      path = bag_dir(path)
      FileUtils.mkdir_p(path) unless File.directory?(path)
      bag = BagIt::Bag.new(path)

      # Looked up once instead of once per datastream.
      tag_streams = bag_tags
      fedora_streams = bag_fedora_tags.values

      # TODO: Writing to bag files is naive; reads file out and writes it.
      #       Possibly there is a better way to do this.
      # NOTE(review): IO#puts appends a newline when the content does not
      # already end with one, so written files can differ from the stored
      # content by one trailing byte -- confirm this is acceptable.
      datastreams.each do |label, ds|
        content = ds.content
        next if content.nil?

        filename = label + mime_extension(ds)
        if tag_streams.include?(ds)
          bag.add_tag_file(filename) { |f| f.puts content }
        elsif fedora_streams.include?(ds)
          bag.add_tag_file('fedora/' + filename) { |f| f.puts content }
        else
          bag.add_file(filename) { |f| f.puts content.force_encoding('UTF-8') }
        end
      end

      bag.tagmanifest!
      bag.manifest!
      bag
    end

    # Enqueues an Exporter job through Resque with this object's pid and the
    # given path.
    # NOTE(review): Exporter is defined outside this file; presumably its
    # perform method ends up calling write_bag -- confirm.
    #
    # @param path [String] sub-path forwarded to the job
    def queue_bag_export(path = '')
      Resque.enqueue(Exporter, pid, path)
    end

    # Recursively removes this object's bag directory if it exists.
    #
    # @param path [String] sub-path handed to +bag_dir+ to locate the bag
    def delete_bag(path = '')
      bag_path = bag_dir(path)
      FileUtils.rm_rf(bag_path) if File.directory?(bag_path)
    end

    private

    # Builds the directory for this object's bag: +path+ is joined under
    # Rails.root/tmp/bags unless it already starts with that prefix, and the
    # pid (via +safe_filename+) is appended as the final component.
    #
    # NOTE(review): +path+ is not validated beyond the prefix check, and
    # delete_bag runs rm_rf on the result -- confirm callers never pass
    # untrusted values.
    def bag_dir(path)
      # TODO: make bag directory configurable?
      path = Rails.root.join("tmp/bags", path) unless path.to_s.start_with?(Rails.root.join("tmp/bags").to_s)
      File.join(path, pid.safe_filename)
    end

    # TODO: allow selection of specific content datastreams to bag
    #       to ignore thumbnails and other derivatives, for example.
    #
    # Returns the datastreams that are neither tag streams nor fedora tag
    # streams, as a Hash of label => datastream.
    def bag_contents
      tag_streams = bag_tags
      # bag_fedora_tags is a Hash keyed by label, so membership has to be
      # tested against its values; Hash#include? only looks at keys.
      fedora_streams = bag_fedora_tags.values
      datastreams.reject { |_label, ds| tag_streams.include?(ds) || fedora_streams.include?(ds) }
    end

    # Returns the non-fedora tag datastreams (the object's metadata streams).
    def bag_tags
      metadata_streams
    end

    # Returns the fedora tag datastreams: those that are RELS-EXT
    # datastreams or whose dsid is "DC", as a Hash of label => datastream.
    def bag_fedora_tags
      datastreams.select { |_label, ds| ds.is_a?(ActiveFedora::RelsExtDatastream) || ds.dsid == "DC" }
    end

    # Returns the filename extension (including the leading dot) for a
    # datastream: ".nt" for N-Triples RDF datastreams, otherwise the first
    # extension registered for the datastream's MIME type.
    #
    # Returns an empty string when no extension can be determined (nil or
    # empty mimeType, unknown type, or a type without extensions) so that
    # filenames never end in a bare ".".
    def mime_extension(ds)
      return '.nt' if ds.kind_of?(ActiveFedora::NtriplesRDFDatastream)

      mime = ds.mimeType.to_s
      return '' if mime.empty?

      type = MIME::Types[mime].first
      ext = type && type.extensions.first
      ext.nil? || ext.empty? ? '' : '.' + ext
    end
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'bagit' | |
require 'filemagic' | |
require 'rdf/ntriples' | |
module Hybag
  # Tries to ingest a bag: validates it, makes sure the referenced
  # collections exist, builds an object of the bag's model with one file
  # datastream per payload file, saves it and imports its descMetadata.
  #
  # @param bag [#complete?, #consistent?, #valid?, #bag_dir, #bag_files] the bag to ingest
  # @param needs_review [Boolean] when false, +review!+ is called on the new item
  # @return [Object] the newly created item
  # @raise [BagImportError] when the bag is incomplete, inconsistent or invalid
  def self.ingest(bag, needs_review=true)
    raise BagImportError, "Bag is incomplete: #{bag.bag_dir}" unless bag.complete?
    raise BagImportError, "Bag is inconsistent." unless bag.consistent?
    raise BagImportError, "Bag is invalid." unless bag.valid?

    model, collections = get_relations(bag)

    # Create a placeholder for every referenced collection that is missing.
    collections.each do |coll_pid|
      next if Collection.exists?(coll_pid)

      coll = Collection.new(pid: coll_pid)
      coll.title = OregonDigital::IdService.noidify(coll.pid)
      coll.save
    end

    # NOTE(review): the model name comes from the bag itself (RELS-EXT or
    # hybag.yml) and is constantized as-is -- confirm bags are trusted, or
    # restrict this to a list of allowed models.
    item = model.constantize.new

    # One libmagic handle for the whole bag, released when done.
    magic = FileMagic.new(FileMagic::MAGIC_MIME)
    begin
      bag.bag_files.each do |file|
        label = File.basename(file, '.*')
        opts = {
          :mimeType => magic.file(file).split(';')[0],
          :label => label,
          :dsid => label
        }
        # forcing binary may not work for all content?
        # File.binread closes the file and returns a binary-encoded string.
        item.add_file_datastream(File.binread(file), opts)
      end
    ensure
      magic.close
    end

    # save to generate a pid
    item.descMetadata.set = collections
    item.save
    import_desc_metadata(item, bag)
    # TODO: add other tag files
    item.review! unless needs_review
    item
  end

  # Determines the model name and collection list for a bag.
  #
  # Sources, in order of preference: fedora/RELS-EXT.rdf, then hybag.yml
  # (keys "model" and "collections"), then the defaults "GenericAsset" and
  # no collections. A hybag.yml that is empty, is not a mapping, or lacks
  # one of the keys falls back to the default for the missing part.
  #
  # @param bag [#bag_dir] the bag to inspect
  # @return [Array(String, Array)] model name and collections
  def self.get_relations(bag)
    rels_path = File.join(bag.bag_dir, "fedora", "RELS-EXT.rdf")
    conf_path = File.join(bag.bag_dir, "hybag.yml")

    if File.exist?(rels_path)
      model, collections = item_from_rels(rels_path)
    elsif File.exist?(conf_path)
      # NOTE(review): YAML.load on a file shipped inside the bag; if bags can
      # come from untrusted sources, consider YAML.safe_load instead.
      bagconf = YAML.load(File.read(conf_path))
      bagconf = {} unless bagconf.is_a?(Hash)
      model = bagconf["model"] || "GenericAsset"
      collections = Array(bagconf["collections"])
    else
      # TODO: Fall back to ingest form if no RELS/config
      model = "GenericAsset"
      collections = []
    end
    [model, collections]
  end

  # The helpers below were preceded by a bare `private`, which has no effect
  # on methods defined with `def self.`; they therefore remain publicly
  # callable, and that visibility is deliberately left unchanged here.

  # Writes the bag's descMetadata.nt statements into item.descMetadata and
  # saves the item.
  #
  # @param item [Object] the object being ingested
  # @param bag [#bag_dir] the bag holding descMetadata.nt
  # @return the result of item.save
  def self.import_desc_metadata(item, bag)
    # TODO: what if descMetadata comes in other formats?
    # TODO: if there is more than one RDF datastream, merge the graph
    graph = RDF::Graph.load(File.join(bag.bag_dir, 'descMetadata.nt'))
    # This assumes that the first subject in the RDF is the bag item
    # TODO: actually figure out which subject to overwrite
    #       could do this by trying to find one which is not also
    #       an object or predicate
    item_subject = graph.first_subject
    graph.each_statement do |statement|
      # Statements about the bag item are re-pointed at the new item's own
      # subject; every other statement keeps its subject.
      subject = statement.subject == item_subject ? item.descMetadata.rdf_subject : statement.subject
      item.descMetadata.append(subject, statement.predicate, statement.object)
    end
    item.save
  end

  # Extracts model and collection associations from a fedora RELS file.
  #
  # The model is the object of the first hasModel statement with the
  # "info:fedora/afmodel:" prefix removed ("GenericAsset" when there is no
  # such statement); the collections are the objects of all isMemberOf
  # statements, as strings.
  #
  # @param file [String] path to the RELS-EXT RDF file
  # @return [Array(String, Array<String>)] model name and collections
  def self.item_from_rels(file)
    # TODO: Move this method somewhere it can be used by other modules?
    model_predicate = "info:fedora/fedora-system:def/model#hasModel"
    coll_predicate = "info:fedora/fedora-system:def/relations-external#isMemberOf"
    rels_graph = RDF::Graph.load(file)

    model = 'GenericAsset'
    if rels_graph.has_predicate?(model_predicate)
      model_triple = rels_graph.to_a.find { |x| x.predicate == model_predicate }
      # sub returns a new string and leaves the value alone when the prefix
      # is absent, instead of raising like String#[]= does.
      model = model_triple.object.to_s.sub("info:fedora/afmodel:", '')
    end

    collections = []
    if rels_graph.has_predicate?(coll_predicate)
      collections = rels_graph.to_a
                              .select { |x| x.predicate == coll_predicate }
                              .map { |triple| triple.object.to_s }
    end
    [model, collections]
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
In OregonDigital's case I think we can strip out the majority of the collection stuff here and count on the metadata to be correct and handle all the association stuff after the model gets saved.