diff options
Diffstat (limited to 'load_copyright.py')
-rw-r--r-- | load_copyright.py | 153 |
1 files changed, 0 insertions, 153 deletions
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Load DEP-5 ``stable_copyright`` files into pandas tables.

Every ``stable_copyright`` file found below the input directory is parsed
into a header paragraph, its ``Files`` paragraphs and its stand-alone
``License`` paragraphs.  The per-package tables are concatenated and written
to an HDF5 store under the keys ``cp_summary``, ``cp_files`` and
``licenses``.

The module deliberately avoids Python-3-only syntax so that it keeps running
on the Python 2 interpreter it was written for as well as on Python 3.
"""

import os
import sys

import pandas as pd

# Directory searched by main() and the HDF5 file it writes by default.
DEFAULT_ROOT = 'metadata.ftp-master.debian.org'
DEFAULT_STORE = 'cp.h5'

# Name of the copyright files to import.
COPYRIGHT_BASENAME = 'stable_copyright'

# Trailing path components that never name a package; get_pkgname() strips
# them to reach the package directory.
_NON_PACKAGE_COMPONENTS = ('current', COPYRIGHT_BASENAME)


class BadFormat(Exception):
    """Raised when a copyright file cannot be read as DEP-5 paragraphs."""


def read_copyright(fh):
    """Parse one copyright file into its header and classified paragraphs.

    Args:
        fh: open file object (or other input accepted by
            ``debian.deb822.Deb822.iter_paragraphs``).

    Returns:
        A ``(header, files, licences)`` tuple: ``header`` is the first
        paragraph as a dict, ``files`` the list of ``Files`` paragraphs
        (those whose ``Files`` value contains ``debian`` are skipped) and
        ``licences`` the list of stand-alone ``License`` paragraphs, each as
        a dict.  The spelling ``Licence`` is normalised to ``License``.

    Raises:
        BadFormat: the input is empty or could not be split into paragraphs.
        ValueError: the header has no ``Format`` field, a ``Files`` paragraph
            has no license, or a paragraph is neither ``Files`` nor
            ``License``.
    """
    # Imported here rather than at module level so that the path helpers in
    # this module stay importable when python-debian is not installed.
    import debian.deb822

    paras = iter(debian.deb822.Deb822.iter_paragraphs(fh))

    try:
        header = dict(next(paras))
        body = []
        for para in paras:
            # The first field decides what kind of paragraph this is.  It is
            # taken from the parsed object, before dict() may lose ordering.
            # An empty paragraph raises StopIteration here, which is reported
            # as BadFormat below.
            first_field = next(iter(para.keys()))
            body.append((first_field, dict(para)))
    except (KeyError, TypeError, StopIteration):
        raise BadFormat('not in DEP-5 format?')

    if 'Format' not in header:
        raise ValueError('no Format field')

    files = []
    licences = []

    for kind, fields in body:
        kind = kind.replace('Licence', 'License')

        if 'Licence' in fields:
            # Some files use the British spelling; store it under the
            # standard field name so later lookups need only one key.
            fields['License'] = fields.pop('Licence')

        if kind == 'Files':
            # Substring match on the whole Files value -- presumably meant
            # to drop the packaging's own debian/* stanza.
            if 'debian' in fields['Files']:
                continue
            if 'License' not in fields:
                raise ValueError('no license: ' + repr(fields))
            files.append(fields)
        elif kind == 'License':
            # TODO: constrain the permissible keys here?
            licences.append(fields)
        else:
            # Be conservative.  Missing license information is a problem.
            raise ValueError('bad para: ' + kind)

    return (header, files, licences)


def _first_line(text):
    """Return *text* up to (not including) its first newline."""
    return text.split('\n')[0]


def import_one(pkgname, fh):
    """Turn one package's copyright file into three single-package tables.

    Args:
        pkgname: source package name, stored in the ``_srcpkg`` column of
            every row.
        fh: open copyright file, passed to :func:`read_copyright`.

    Returns:
        ``(summary, files, licences)`` as pandas DataFrames, or ``None`` when
        the file is unreadable, invalid, or has no stand-alone ``License``
        paragraph.  Parse problems are reported on stdout, not raised.
    """
    try:
        header, files, licences = read_copyright(fh)
    except BadFormat:
        print('info: not readable')
        return None
    except ValueError as e:
        print('err: %s' % (e,))
        return None

    if not licences:
        return None

    header['_srcpkg'] = pkgname
    header['_license'] = _first_line(header.get('License', ''))

    if 'Upstream-Name' in header:
        # Make spaces breakable (!).
        # Conceivably other characters need replacing.
        upstream = header['Upstream-Name'].replace(u'\xa0', ' ')
        if '@' in upstream:
            # Presumably an address was entered instead of a name; fall back
            # to the package name.
            upstream = pkgname
        header['Upstream-Name'] = upstream

    for para in files + licences:
        para['_srcpkg'] = pkgname
        # Only the first line of License is the short name; the rest of the
        # field is the license text.
        para['_license'] = _first_line(para['License'])

    return (pd.DataFrame([header]), pd.DataFrame(files),
            pd.DataFrame(licences))


def get_pkgname(path):
    """Return the package directory name that *path* belongs to.

    Trailing ``stable_copyright`` and ``current`` components are dropped and
    the last remaining component is returned (``''`` if nothing remains).
    """
    parent, base = os.path.split(path)
    while base in _NON_PACKAGE_COMPONENTS:
        parent, base = os.path.split(parent)
    return base


def _find_copyright_files(root):
    """Yield the path of every ``stable_copyright`` file below *root*."""
    for dirpath, dirnames, filenames in os.walk(root):
        # Sorting in place makes the traversal order reproducible.
        dirnames.sort()
        if COPYRIGHT_BASENAME in filenames:
            yield os.path.join(dirpath, COPYRIGHT_BASENAME)


def main(root=DEFAULT_ROOT, output=DEFAULT_STORE):
    """Import every copyright file below *root* and store the tables.

    Args:
        root: directory searched recursively for ``stable_copyright`` files.
        output: path of the HDF5 store to write.

    Nothing is written when no file could be imported; a note is printed to
    stderr instead.
    """
    summaries = []
    files = []
    licenses = []

    for path in _find_copyright_files(root):
        pkgname = get_pkgname(path)
        print('%s %s' % (pkgname, path))

        # Binary mode: decoding is left to the deb822 parser, as it was when
        # this ran on Python 2.  The with-block closes each file promptly.
        with open(path, 'rb') as fh:
            data = import_one(pkgname, fh)

        if data is not None:
            summary, file_table, license_table = data
            summaries.append(summary)
            files.append(file_table)
            licenses.append(license_table)

        print('')

    if not summaries:
        # pd.concat() raises ValueError on an empty list.
        sys.stderr.write('err: no usable %s files under %s\n'
                         % (COPYRIGHT_BASENAME, root))
        return

    all_summaries = pd.concat(summaries)
    all_files = pd.concat(files)
    all_licenses = pd.concat(licenses)

    store = pd.HDFStore(output)
    try:
        store['cp_summary'] = all_summaries
        store['cp_files'] = all_files
        store['licenses'] = all_licenses
    finally:
        store.close()


if __name__ == '__main__':
    main()