#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Collect machine-readable Debian copyright files into an HDF5 store.

Every ``stable_copyright`` file found below ``metadata.ftp-master.debian.org``
is parsed; the header, ``Files`` and ``License`` paragraphs of all readable
files are concatenated into three pandas DataFrames and written to ``cp.h5``
under the keys ``cp_summary``, ``cp_files`` and ``licenses``.

The code is written to run unchanged on Python 2.7 and Python 3.
"""

import os
import subprocess
import sys

import debian.deb822
import pandas as pd

# Directory tree searched for copyright files, and the name searched for.
_METADATA_ROOT = 'metadata.ftp-master.debian.org'
_COPYRIGHT_NAME = 'stable_copyright'
# Output HDF5 file.
_STORE_PATH = 'cp.h5'


class BadFormat(Exception):
    """Raised when a copyright file cannot be read as DEP-5 paragraphs."""


def read_copyright(fh):
    """Parse a copyright file into its header, Files and License paragraphs.

    Args:
        fh: Line source handed to ``debian.deb822.Deb822.iter_paragraphs``
            (normally an open file).

    Returns:
        A tuple ``(header, files, licences)``: the first paragraph as a
        dict, the list of ``Files`` paragraphs (as dicts) that were not
        skipped, and the list of stand-alone ``License`` paragraphs.

    Raises:
        BadFormat: If the paragraphs cannot be read (no first paragraph,
            or a KeyError/TypeError while reading them).
        ValueError: If the header has no ``Format`` field, a ``Files``
            paragraph has no license, or a paragraph is of any other kind.
    """
    paras = debian.deb822.Deb822.iter_paragraphs(fh)
    try:
        header = dict(next(paras))
        # A paragraph's kind is the name of its first field.
        paras = [(list(p.keys())[0], dict(p)) for p in paras]
    except (KeyError, TypeError, StopIteration):
        raise BadFormat('not in DEP-5 format?')

    if 'Format' not in header:
        raise ValueError('no Format field')

    files = []
    licences = []
    for (kind, para) in paras:
        # Normalise the British spelling in both the kind and the field name.
        kind = kind.replace('Licence', 'License')
        if 'Licence' in para:
            para['License'] = para.pop('Licence')

        if kind == 'Files':
            # Substring test: skips any paragraph whose Files value mentions
            # 'debian' (presumably the packaging files -- confirm intent).
            if 'debian' in para['Files']:
                continue
            if 'License' not in para:
                raise ValueError('no license: ' + repr(para))
            files.append(para)
        elif kind == 'License':
            # XXX constrain permissible keys here?
            licences.append(para)
        else:
            # Be conservative.  Missing license information is a problem.
            raise ValueError('bad para: ' + kind)

    return (header, files, licences)


def import_one(pkgname, fh):
    """Convert one package's copyright file into three DataFrames.

    Args:
        pkgname: Source package name, stored in the ``_srcpkg`` column of
            every row.
        fh: Line source passed on to ``read_copyright``.

    Returns:
        ``(copy_summary, copy_files, licence)`` DataFrames built from the
        header, the Files paragraphs and the License paragraphs, or ``None``
        when the file is unreadable, invalid, or has no stand-alone License
        paragraphs.  A short diagnostic is printed in the first two cases.
    """
    try:
        (header, files, licences) = read_copyright(fh)
    except BadFormat:
        print('info: not readable')
        return None
    except ValueError as e:
        print('err: %s' % (e,))
        return None

    if not licences:
        return None

    header['_srcpkg'] = pkgname
    # Only the first line of a License field is kept as the short name.
    header['_license'] = header.get('License', '').split('\n')[0]
    if 'Upstream-Name' in header:
        # Make spaces breakable (!).
        # Conceivably other characters need replacing.
        header['Upstream-Name'] = \
            header['Upstream-Name'].replace(u'\xa0', ' ')
        # A name containing '@' is replaced by the package name.
        if '@' in header['Upstream-Name']:
            header['Upstream-Name'] = pkgname
    copy_summary = pd.DataFrame([header])

    for para in files:
        para['_srcpkg'] = pkgname
        para['_license'] = para['License'].split('\n')[0]

    for para in licences:
        para['_srcpkg'] = pkgname
        para['_license'] = para['License'].split('\n')[0]

    copy_files = pd.DataFrame(files)
    licence = pd.DataFrame(licences)

    return (copy_summary, copy_files, licence)


def get_pkgname(path):
    """Return the last component of *path* that is not a known suffix.

    Trailing ``current`` and ``stable_copyright`` components are stripped,
    so ``a/foo/current/stable_copyright`` yields ``foo``.
    """
    (parent, base) = os.path.split(path)
    while base in ('current', 'stable_copyright'):
        (parent, base) = os.path.split(parent)
    return base


def main():
    """Import every copyright file under the metadata tree into ``cp.h5``.

    Exits with an error message if no file could be imported.
    """
    summaries = []
    files = []
    licenses = []

    # universal_newlines gives text on Python 3 as well; splitting on lines
    # (not on arbitrary whitespace) keeps paths that contain spaces intact.
    found = subprocess.check_output(
        ['find', _METADATA_ROOT, '-name', _COPYRIGHT_NAME],
        universal_newlines=True)

    for path in found.splitlines():
        pkgname = get_pkgname(path)
        print('%s %s' % (pkgname, path))
        # Binary mode leaves decoding to deb822; the handle is closed as
        # soon as the file has been parsed.
        with open(path, 'rb') as fh:
            data = import_one(pkgname, fh)
        if data is not None:
            (summary, file_, license_) = data
            summaries.append(summary)
            files.append(file_)
            licenses.append(license_)
        print('')

    if not summaries:
        # pd.concat() raises an opaque ValueError on an empty list.
        sys.exit('err: no usable copyright files found')

    summaries = pd.concat(summaries)
    files = pd.concat(files)
    licenses = pd.concat(licenses)

    store = pd.HDFStore(_STORE_PATH)
    try:
        store['cp_summary'] = summaries
        store['cp_files'] = files
        store['licenses'] = licenses
    finally:
        store.close()


if __name__ == '__main__':
    main()