diff options
Diffstat (limited to 'load_copyright.py')
-rw-r--r-- | load_copyright.py | 130 |
1 files changed, 130 insertions, 0 deletions
"""Load DEP-5 style ``copyright`` files into pandas tables.

Every path passed to :func:`main` is parsed with ``debian.deb822``.  The
header paragraph, the ``Files`` paragraphs and the stand-alone ``License``
paragraphs of each readable file are gathered into three DataFrames, which
are then written to the HDF5 store ``cp.h5``.

The code deliberately avoids version-specific syntax so that it runs on
both Python 2.6+ and Python 3.
"""

import os
import sys

import debian.deb822
import pandas as pd

# Fields accepted in a ``Files`` paragraph; any other field is rejected.
_FILE_FIELDS = frozenset([
    'Authors', 'Files', 'Comment', 'Copyright', 'Disclaimer', 'Homepage',
    'License', 'License-Alias', 'Upstream-Authors', 'X-Comment',
])


class BadFormat(Exception):
    """Raised when the input cannot be read as a sequence of paragraphs."""


def read_copyright(fh):
    """Parse one copyright file into its header, Files and License parts.

    :param fh: input accepted by ``Deb822.iter_paragraphs`` (normally an
        open file).
    :returns: ``(header, files, licences)`` where ``header`` is a dict and
        the other two are lists of dicts, one per paragraph.
    :raises BadFormat: if there is no first paragraph or the paragraphs
        cannot be converted to dicts.
    :raises ValueError: if the header has no ``Format`` /
        ``Format-Specification`` field, a ``Files`` paragraph has no
        licence or carries an unexpected field, or a paragraph is neither
        a ``Files`` nor a ``License`` paragraph.
    """
    paras = debian.deb822.Deb822.iter_paragraphs(fh)

    try:
        header = dict(next(paras))
        # A paragraph is classified by its first field.  The keys are
        # copied into a list first because dict views cannot be indexed on
        # Python 3; the TypeError that would raise is caught below and
        # would make every file look unreadable.
        paras = [(list(p.keys())[0], dict(p)) for p in paras]
    except (KeyError, TypeError, StopIteration):
        raise BadFormat('not in DEP-5 format?')

    if 'Format' not in header and 'Format-Specification' not in header:
        raise ValueError('no Format field')

    files = []
    licences = []

    for (kind, d) in paras:
        # Accept the British spelling, both as paragraph type and as field.
        kind = kind.replace('Licence', 'License')

        if 'Licence' in d:
            d['License'] = d.pop('Licence')

        if kind == 'Files':
            if 'License' not in d:
                raise ValueError('no license: ' + repr(d))

            # Raise instead of assert: asserts vanish under ``python -O``
            # and an AssertionError would abort the whole run instead of
            # skipping just this file.
            unexpected = set(d.keys()) - _FILE_FIELDS
            if unexpected:
                raise ValueError(
                    'unexpected fields: ' + repr(sorted(unexpected)))

            files.append(d)
        elif kind == 'License':
            # XXX constrain permissible keys here?
            licences.append(d)
        else:
            # Be conservative. Missing license information is a problem.
            raise ValueError('bad para: ' + kind)

    return (header, files, licences)


def import_one(pkgname, fh):
    """Build the three DataFrames describing one package's copyright file.

    :param pkgname: package name stored in the ``Package`` column of every
        row.
    :param fh: input handed to :func:`read_copyright`.
    :returns: ``(copy_summary, copy_files, licence)`` DataFrames, or
        ``None`` (after printing a diagnostic) when the file is rejected.
    """
    try:
        (header, files, licences) = read_copyright(fh)
    except BadFormat:
        print('info: not readable')
        return None
    except ValueError as e:
        print('err: %s' % (e,))
        return None

    header['Package'] = pkgname
    copy_summary = pd.DataFrame([header])

    for d in files + licences:
        d['Package'] = pkgname
        # The first line of the License field is kept as a short name.
        d['_license'] = d['License'].split('\n')[0]

    copy_files = pd.DataFrame(files)
    licence = pd.DataFrame(licences)
    return (copy_summary, copy_files, licence)


def get_pkgname(path):
    """Return the last component of *path* that is not a known file name.

    Trailing ``current`` and ``copyright`` components are stripped, so
    ``foo/current/copyright`` yields ``foo``.
    """
    (parent, base) = os.path.split(path)

    while base in ('current', 'copyright'):
        (parent, base) = os.path.split(parent)

    return base


def main(paths):
    """Import every file in *paths* and store the result in ``cp.h5``.

    Files that cannot be imported are skipped.  If nothing could be
    imported, a message is written to stderr and no store is created.
    """
    summaries = []
    files = []
    licenses = []

    for path in paths:
        pkgname = get_pkgname(path)
        print('%s %s' % (pkgname, path))

        # Binary mode hands raw bytes to deb822, which does the decoding
        # itself; the ``with`` block closes each file as soon as it is read.
        with open(path, 'rb') as fh:
            data = import_one(pkgname, fh)

        if data is not None:
            (summary, file_, licence) = data
            summaries.append(summary)
            files.append(file_)
            licenses.append(licence)

    if not summaries:
        # pd.concat() raises on an empty list.
        sys.stderr.write('err: no readable copyright files; nothing stored\n')
        return

    summaries = pd.concat(summaries)
    files = pd.concat(files)
    licenses = pd.concat(licenses)

    store = pd.HDFStore('cp.h5')
    try:
        store['cp_summary'] = summaries
        store['cp_files'] = files
        store['licenses'] = licenses
    finally:
        store.close()


if __name__ == '__main__':
    main(sys.argv[1:])