aboutsummaryrefslogtreecommitdiff
path: root/load_copyright.py
diff options
context:
space:
mode:
Diffstat (limited to 'load_copyright.py')
-rw-r--r--load_copyright.py130
1 files changed, 130 insertions, 0 deletions
diff --git a/load_copyright.py b/load_copyright.py
new file mode 100644
index 0000000..4016857
--- /dev/null
+++ b/load_copyright.py
@@ -0,0 +1,130 @@
+
+import os
+import sys
+
+import debian.deb822
+import pandas as pd
+
+#from IPython.core import ultratb
+#sys.excepthook = ultratb.FormattedTB(mode='Verbose',
+# color_scheme='Linux', call_pdb=1)
+
class BadFormat(Exception):
    """Raised when a copyright file cannot be read as DEP-5 paragraphs.

    Deliberately *not* a ValueError subclass: callers in this file report
    BadFormat as informational ("not readable") and ValueError as an error.
    """
+
def read_copyright(fh):
    """Parse a machine-readable copyright file into its paragraphs.

    The input is split into deb822 paragraphs.  The first paragraph is the
    header; every following paragraph is classified by the name of its first
    field ("Files" or "License").  The British spelling "Licence" is
    normalised to "License" both in the paragraph type and as a field name.

    Args:
        fh: Input handed straight to ``Deb822.iter_paragraphs`` (the caller
            in this file passes an open file object).

    Returns:
        A tuple ``(header, files, licences)``: ``header`` is a dict of the
        first paragraph, ``files`` is a list of dicts (one per "Files"
        paragraph) and ``licences`` is a list of dicts (one per stand-alone
        "License" paragraph).

    Raises:
        BadFormat: There is no first paragraph, or the paragraphs could not
            be converted to dicts (including a paragraph with no fields).
        ValueError: The header has neither a "Format" nor a
            "Format-Specification" field, a "Files" paragraph has no
            license, or a paragraph is of an unknown type.
        AssertionError: A "Files" paragraph contains a field outside the
            known set.
    """
    paragraphs = debian.deb822.Deb822.iter_paragraphs(fh)

    try:
        # The next() builtin works on Python 2.6+ and 3; the iterator's
        # .next() method exists only on Python 2.
        header = dict(next(paragraphs))
        # list(...) because keys() may be a non-subscriptable view on
        # Python 3.  A paragraph without any field raises IndexError here,
        # which is reported as BadFormat like the other parse failures.
        body = [(list(p.keys())[0], dict(p)) for p in paragraphs]
    except (KeyError, TypeError, IndexError, StopIteration):
        raise BadFormat('not in DEP-5 format?')

    if 'Format' not in header and 'Format-Specification' not in header:
        raise ValueError('no Format field')

    files = []
    licences = []
    file_fields = frozenset((
        'Authors', 'Files', 'Comment', 'Copyright', 'Disclaimer', 'Homepage',
        'License', 'License-Alias', 'Upstream-Authors', 'X-Comment',
    ))

    for (kind, fields) in body:
        kind = kind.replace('Licence', 'License')

        if 'Licence' in fields:
            # Some files use the British spelling as a field name; store the
            # value under the canonical key instead.
            fields['License'] = fields.pop('Licence')

        if kind == 'Files':
            if 'License' not in fields:
                raise ValueError('no license: ' + repr(fields))

            keys = set(fields.keys())
            if not keys <= file_fields:
                # Raised explicitly (rather than via an assert statement) so
                # the check is not stripped when running under ``python -O``.
                raise AssertionError(keys)
            files.append(fields)
        elif kind == 'License':
            # XXX constrain permissible keys here?
            licences.append(fields)
        else:
            # Be conservative. Missing license information is a problem.
            raise ValueError('bad para: ' + kind)

    return (header, files, licences)
+
def import_one(pkgname, fh):
    """Load one package's copyright file into three pandas DataFrames.

    Args:
        pkgname: Package name; stored in a "Package" column of every row.
        fh: Input passed on to ``read_copyright``.

    Returns:
        ``None`` if the file could not be parsed (a one-line diagnostic is
        printed instead), otherwise a tuple
        ``(copy_summary, copy_files, licence)``:
        a one-row frame built from the header paragraph, a frame with one
        row per "Files" paragraph, and a frame with one row per stand-alone
        "License" paragraph.  Rows of the last two frames also get a
        "_license" column holding the first line of their "License" field.
    """
    try:
        (header, files, licences) = read_copyright(fh)
    except BadFormat:
        print('info: not readable')
        return None
    except ValueError as e:
        # Single-argument print(...) produces the same output on Python 2
        # and 3.
        print('err: %s' % (e,))
        return None

    header['Package'] = pkgname
    copy_summary = pd.DataFrame([header])

    # Both kinds of paragraph are tagged identically; read_copyright
    # guarantees each of them has a "License" field.
    for para in files + licences:
        para['Package'] = pkgname
        para['_license'] = para['License'].split('\n')[0]

    copy_files = pd.DataFrame(files)
    licence = pd.DataFrame(licences)
    return (copy_summary, copy_files, licence)
+
def get_pkgname(path):
    """Derive a package name from the path of its copyright file.

    Trailing path components named "current" or "copyright" are stripped;
    the last remaining component is returned.  If nothing remains (for
    example the path is just "copyright"), the result is the empty string.

    >>> get_pkgname('pool/foo/current/copyright')
    'foo'
    """
    (parent, leaf) = os.path.split(path)

    # Walk upwards while the last component is one of the generic names.
    # Terminates because os.path.split eventually yields an empty leaf.
    while leaf in ('current', 'copyright'):
        (parent, leaf) = os.path.split(parent)

    return leaf
+
def main(paths):
    """Import every copyright file in *paths* and save the result to cp.h5.

    For each path the package name is derived with ``get_pkgname`` and the
    file is parsed with ``import_one``; files that cannot be parsed are
    skipped.  The per-file frames are concatenated and written to the HDF5
    store 'cp.h5' under the keys 'cp_summary', 'cp_files' and 'licenses'.

    Args:
        paths: Iterable of paths to copyright files.

    Raises:
        ValueError: No file could be imported, so there is nothing to
            concatenate or store.
    """
    summaries = []
    files = []
    licenses = []

    for path in paths:
        pkgname = get_pkgname(path)
        print('%s %s' % (pkgname, path))

        # open() instead of the Python 2-only file() builtin; the with block
        # closes the handle once import_one has consumed it.
        with open(path) as fh:
            data = import_one(pkgname, fh)

        if data is None:
            continue

        (summary, file_, licence) = data
        summaries.append(summary)
        files.append(file_)
        licenses.append(licence)

    if not summaries:
        # pd.concat() rejects an empty list with a ValueError of its own;
        # fail with the same exception type but a message that says why.
        raise ValueError('no copyright data could be imported')

    summaries = pd.concat(summaries)
    files = pd.concat(files)
    licenses = pd.concat(licenses)

    store = pd.HDFStore('cp.h5')
    try:
        store['cp_summary'] = summaries
        store['cp_files'] = files
        store['licenses'] = licenses
    finally:
        # Close the store even if one of the writes fails.
        store.close()
+
# Script entry point: each command-line argument is passed to main() as a
# path to import.
if __name__ == '__main__':
    main(sys.argv[1:])
+