aboutsummaryrefslogtreecommitdiff
path: root/load_copyright.py
diff options
context:
space:
mode:
Diffstat (limited to 'load_copyright.py')
-rw-r--r--load_copyright.py153
1 files changed, 0 insertions, 153 deletions
diff --git a/load_copyright.py b/load_copyright.py
deleted file mode 100644
index 838d907..0000000
--- a/load_copyright.py
+++ /dev/null
@@ -1,153 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import os
-import sys
-
-import debian.deb822
-import pandas as pd
-import subprocess
-
-#from IPython.core import ultratb
-#sys.excepthook = ultratb.FormattedTB(mode='Verbose',
-# color_scheme='Linux', call_pdb=1)
-
-class BadFormat(Exception):
- pass
-
-def read_copyright(fh):
- paras = debian.deb822.Deb822.iter_paragraphs(fh)
-
- try:
- header = paras.next()
-
- header = dict(header)
- paras = [(p.keys()[0], dict(p)) for p in paras]
- except (KeyError, TypeError, StopIteration):
- raise BadFormat('not in DEP-5 format?')
- return
-
- if 'Format' not in header:
- raise ValueError('no Format field')
- return
-
- files = []
- licences = []
- #file_fields = set([
- # 'Authors', 'Files', 'Comment', 'Copyright', 'Disclaimer', 'Homepage',
- # 'License', 'License-Alias', 'Upstream-Authors', 'X-Comment',
- # 'X-Notes'])
-
- for (type, d) in paras:
- type = type.replace('Licence', 'License')
-
- if 'Licence' in d:
- # !!!
- d['License'] = d['Licence']
- del d['Licence']
-
- if type == 'Files':
- if 'debian' in d['Files']:
- continue
- if 'License' not in d:
- raise ValueError('no license: ' + repr(d))
- return
-
- #keys = set(d.keys())
- #assert keys <= file_fields, keys
- files.append(d)
- elif type == 'License':
- # XXX constrain permissible keys here?
- #print d.keys()
- licences.append(d)
- else:
- # Be conservative. Missing license information is a problem.
- raise ValueError('bad para: ' + type)
-
- return (header, files, licences)
-
-def import_one(pkgname, fh):
- try:
- (header, files, licences) = read_copyright(fh)
- except BadFormat:
- print 'info: not readable'
- return None
- except ValueError, e:
- print 'err:', e
- #print 'err:', repr(e)
- return None
-
- if licences == []:
- return None
- header['_srcpkg'] = pkgname
- header['_license'] = header.get('License', '').split('\n')[0]
-
- if 'Upstream-Name' in header:
- # Make spaces breakable (!).
- # Conceivably other characters need replacing.
- header['Upstream-Name'] = \
- header['Upstream-Name'].replace(u'\xa0', ' ')
- #copy_summary['Upstream-Name'].replace('\xc2\xa0', ' ')
- if '@' in header['Upstream-Name']:
- header['Upstream-Name'] = pkgname
-
- copy_summary = pd.DataFrame([header])
- #print copy_summary.T.to_string()
- #print
-
- for d in files:
- d['_srcpkg'] = pkgname
- d['_license'] = d['License'].split('\n')[0]
-
- for d in licences:
- d['_srcpkg'] = pkgname
- d['_license'] = d['License'].split('\n')[0]
-
- copy_files = pd.DataFrame(files)
- licence = pd.DataFrame(licences)
- return (copy_summary, copy_files, licence)
-
-def get_pkgname(path):
- (dir, base) = os.path.split(path)
-
- if base in ('current', 'stable_copyright'):
- return get_pkgname(dir)
- else:
- return base
-
-def main():
- summaries = []
- files = []
- licenses = []
-
- for path in subprocess.check_output("find metadata.ftp-master.debian.org -name stable_copyright".split()).strip().split():
- pkgname = get_pkgname(path)
- print pkgname, path
- data = import_one(pkgname, file(path))
-
- if data is not None:
- (summary, file_, license) = data
- summaries.append(summary)
- files.append(file_)
- licenses.append(license)
-
- print
-
- summaries = pd.concat(summaries)
- files = pd.concat(files)
- licenses = pd.concat(licenses)
-
- #from IPython import embed
- #embed()
-
- #from IPython.core.debugger import Pdb
- #Pdb().set_trace()
-
- store = pd.HDFStore('cp.h5')
- store['cp_summary'] = summaries
- store['cp_files'] = files
- store['licenses'] = licenses
- store.close()
-
-if __name__ == '__main__':
- main()