diff options
Diffstat (limited to 'export.py')
-rw-r--r-- | export.py | 419 |
1 files changed, 0 insertions, 419 deletions
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Recovered from a whitespace-collapsed rendering of this diff:
#   diff --git a/export.py b/export.py
#   deleted file mode 100644
#   index bf45e5e..0000000
#   --- a/export.py
#   +++ /dev/null
#   @@ -1,419 +0,0 @@
"""Export Debian package metadata as wiki template pages.

Package, copyright and changelog tables are read from three HDF5 stores
(``pkg.h5``, ``cp.h5``, ``cl.h5``) in the current directory.  For every
upstream project a sequence of ``Template`` objects is produced and either
printed (one project) or written to ``<outputdir>/<name>.wiki`` together
with an ``index.json`` (all projects).

This module is written for Python 2 (``unicode``, ``print >>``,
``dict.iteritems``).
"""

import datetime
import itertools
import json
import os
import re
import sys
import textwrap

import pandas as pd

import license

# Package fields which refer to download locations.  Only the
# whitespace-separated words matter; see extract_resources().
download_keys = """
    Origin
    Original-Source
    Source
    Source-Code
    X-Origin
    X-Original-Package
    X-Source
    """


def concat(xss):
    """Return one list holding the items of every iterable in *xss*."""
    return list(itertools.chain.from_iterable(xss))


def indent(s):
    """Prefix each non-empty line of *s* with a space.

    Every line of the result, including empty ones, ends with a newline.
    """
    return ''.join(
        ' %s\n' % line if line else '\n'
        for line in s.splitlines())


def today():
    """Return the current local date formatted as ``YYYY-MM-DD``."""
    return datetime.datetime.now().strftime('%Y-%m-%d')


def warn(*x):
    """Print ``warning:`` followed by the arguments to stderr."""
    for s in ('warning:',) + x + ('\n',):
        print >>sys.stderr, s,


class ExportFailure(Exception):
    """Raised when a single page cannot be exported.

    export_all_to_directory() catches it, warns and skips the page.
    """
    pass


def _read_store(path, keys):
    """Return the objects stored under *keys* in the HDF5 file *path*.

    The store is closed even when one of the lookups raises.
    """
    store = pd.HDFStore(path)
    try:
        return [store[key] for key in keys]
    finally:
        store.close()


class PkgData(object):
    """All input tables, loaded eagerly from the HDF5 stores.

    Attributes (named after the store keys they are read from):
      pkgs      'packages' from pkg.h5
      descs     'descriptions' from pkg.h5
      srcs      'sources' from pkg.h5
      cpf       'cp_files' from cp.h5
      cps       'cp_summary' from cp.h5
      licenses  'licenses' from cp.h5
      cl        'cl_versions' from cl.h5
    """

    def __init__(self):
        (self.pkgs, self.descs, self.srcs) = _read_store(
            'pkg.h5', ['packages', 'descriptions', 'sources'])
        (self.cpf, self.cps, self.licenses) = _read_store(
            'cp.h5', ['cp_files', 'cp_summary', 'licenses'])
        (self.cl,) = _read_store('cl.h5', ['cl_versions'])


def nouni(s):
    """Return *s* UTF-8 encoded if it is ``unicode``, else unchanged."""
    return s.encode('utf8') if isinstance(s, unicode) else s


class Template(object):
    """A named template with an ordered list of ``(field, value)`` pairs.

    ``str()`` renders it as::

        {{name
        |field=value
        ...
        }}
    """

    def __init__(self, name, values):
        self.name = name
        self.values = values

    def __str__(self):
        fields = ['|%s=%s' % (nouni(n), nouni(v))
                  for (n, v) in self.values]
        return '{{%s\n%s\n}}' % (nouni(self.name), '\n'.join(fields))


def parse_tags(s):
    """Split a ``', '``-separated tag field into a list of tags.

    Newlines are removed first so that wrapped fields parse correctly.
    """
    return s.replace('\n', '').split(', ')


def extract_languages(tags):
    """Return the sorted, de-duplicated language names found in *tags*.

    A language is taken from ``implemented-in::<lang>`` and from
    ``devel::lang:<lang>`` tags.  Tags without a ``::`` separator are
    ignored instead of raising.
    """
    langs = set()

    for tag in tags:
        (facet, sep, value) = tag.partition('::')

        if not sep:
            continue

        if facet == 'implemented-in':
            langs.add(value)
        elif facet == 'devel' and value.startswith('lang:'):
            langs.add(value.split(':')[1])

    # Sorted so that repeated exports produce identical pages.
    return sorted(langs)


def catechise(s):
    """Wrap flagged terms in *s* with ``??...??`` markers.

    Matching is case-insensitive and a space inside a term also matches a
    newline (which is turned into a space in the marked-up result).
    """
    heresies = ["open source", "debian", "(?<!gnu/)linux", "creative commons"]
    pattern = '\\b(%s)\\b' % '|'.join([h.replace(' ', '.') for h in heresies])
    # The flags must be passed by keyword: the fourth positional argument
    # of re.sub() is ``count``, not ``flags``.
    return re.sub(pattern,
                  lambda m: '??%s??' % m.group(1).replace('\n', ' '),
                  s,
                  flags=re.DOTALL | re.IGNORECASE)


def munge_description(s):
    """Re-wrap a multi-paragraph description to 65 columns.

    Paragraphs are separated by a line containing only `` .``; they are
    joined back together with blank lines.
    """
    paras = s.split('\n .\n')
    return '\n\n'.join(
        textwrap.fill(para.lstrip().replace('\n', ''), 65)
        for para in paras)


def get_license_map():
    """Read the ``license_map`` file and return ``{alias: canonical}``.

    The file consists of blank-line separated paragraphs.  Each paragraph
    starts with ``[Canonical name]`` followed by one alias per line.
    """
    with open('license_map') as f:
        content = f.read()

    alias_map = {}

    for para in content.split('\n\n'):
        if not para:
            continue

        match = re.match(r'\[([^\]]+)\]', para)
        assert match, para
        canonical = match.group(1)

        for alias in para[match.end():].lstrip().splitlines():
            alias_map[alias] = canonical

    return alias_map


def srcpkg_extract_licenses(header, filess, licenses, cl_date, cl_uploader):
    """Yield one 'Project license' template per row of *filess*.

    header       copyright header row (currently unused, see XXX below)
    filess       frame of Files stanzas; reads '_license', 'License' and,
                 when present, 'Copyright'
    licenses     frame of stand-alone License stanzas; reads '_license'
                 and 'License'
    cl_date      value for the 'License verified date' field
    cl_uploader  name used in the 'License verified by' field
    """
    # XXX: generate template from header stanza
    # XXX: flag CC licenses
    # XXX: check all License stanzas were included
    # XXX: exclude licenses for Files: debian/*
    lmap = get_license_map()

    # Licence name -> licence text.  When the field has several lines the
    # first one (the short name) is dropped.
    by_name = {}

    for (_idx, s) in licenses.iterrows():
        text = s['License']

        if '\n' in text:
            text = text.split('\n', 1)[1]

        by_name[s['_license']] = text

    for (_ix, files) in filess.iterrows():
        ldesc = files['_license'].strip().lower()
        ltext = files['License']

        if '\n' in ltext:
            # Looks like license text is included directly.
            ltext = munge_description(ltext)
            txt = 'License: %s\n\n%s' % (ldesc, ltext)
        elif ldesc in by_name:
            # License information is a stub. Try to find the corresponding
            # text(s).
            ltext = munge_description(by_name[ldesc])
            txt = 'License: %s\n\n%s' % (ldesc, ltext)
        else:
            parsed = license.parse_licenses(ldesc)
            lnames = list(parsed.flatten())
            missing = set(lnames) - set(by_name)

            # Always bind txt.  It used to be set only when a text was
            # missing, so the other case either raised UnboundLocalError
            # or silently reused the previous row's text.
            txt = 'License: %s' % (parsed,)

            if lnames and not missing:
                # Every component licence has a known text: append each
                # one once, in order of first appearance.
                seen = set()
                texts = []

                for lname in lnames:
                    if lname not in seen:
                        seen.add(lname)
                        texts.append(munge_description(by_name[lname]))

                txt = '%s\n\n%s' % (txt, '\n\n'.join(texts))

        canon = lmap.get(ldesc.lower(), 'Other')
        # XXX: Should maybe bail if there's no copyright field.
        # NOTE(review): the lines are joined without a separator, so
        # several copyright lines run together -- confirm this is intended.
        cp = ''.join(
            u'Copyright %s' % line.lstrip()
            for line in files.dropna().get('Copyright', '').splitlines())

        # nouni() rather than .encode(): encoding a byte string that holds
        # non-ASCII data raises UnicodeDecodeError in Python 2.
        cp = nouni(cp)
        txt = nouni(txt)

        yield Template('Project license', [
            ('License', canon),
            ('License copyright', cp),
            ('License verified by', 'Debian: %s' % cl_uploader),
            ('License verified date', cl_date),
            ('License note', txt)])


def parse_person(s):
    """Split ``'Name <email>'`` into ``(name, email)``.

    Returns ``(s, '')`` when *s* does not have that shape.
    """
    match = re.match(r'([^<]+)\s+<([^>]+)>', s)

    if match:
        return (match.group(1), match.group(2))
    else:
        return (s, '')


def extract_people(df):
    """Yield a 'Person' template for the 'Upstream-Contact' field, if set."""
    # XXX: extract contributors, maintainers
    df = df.dropna()

    if 'Upstream-Contact' in df:
        (name, email) = parse_person(df['Upstream-Contact'])
        yield Template('Person', [
            ('Real name', name),
            ('Role', 'contact'),
            ('Email', email)])


def extract_resources(cp_header):
    """Yield a 'Resource' template for each download field that is set.

    The candidate field names are the words in ``download_keys``.
    """
    cp_header = cp_header.dropna()

    for key in re.findall(r'\S+', download_keys):
        if key in cp_header:
            yield Template('Resource', [
                ('Resource kind', 'Download'),
                ('Resource URL', cp_header[key])])


def export_srcpkgs(data, name, srcpkg_names):
    """Export a package by reference to its constituent source packages.

    This coordinates all the information that goes into a particular page.

    data          a PkgData instance
    name          page (upstream) name
    srcpkg_names  list of source package names making up the project

    Yields the page's templates in order: 'Entry', 'Import', the license
    templates of each source package, then people, then resources.  Yields
    nothing (after a warning) when no source or binary packages are found.
    """
    if not srcpkg_names:
        # pd.concat() raises on an empty list.
        warn('no source packages found for', name)
        return

    # Map source package names to binary packages, and also make note
    # of which versions of those source packages we're looking at.
    binpkgs = pd.concat([
        data.pkgs[data.pkgs['_srcpkg'] == srcpkg]
        for srcpkg in srcpkg_names])

    if len(binpkgs) == 0:
        warn('no binary packages found for', srcpkg_names)
        return

    versions = {}
    version_id = ''

    for (_i, pkg) in binpkgs.iterrows():
        versions[pkg['_srcpkg']] = pkg['Version']
        # The 'Entry' template reports the version of the last binary
        # package visited.
        version_id = pkg['Version']

    # Build the pool URL of every source file.
    srcfile_urls = []

    for srcpkg in srcpkg_names:
        found = data.srcs[data.srcs['_srcpkg'] == srcpkg]['srcfile'].values

        if len(found) == 0:
            warn('no source file found for', srcpkg)
            continue

        srcfile = found[0]
        # Pool directories are keyed on the first letter, or on the first
        # four characters for names starting with 'lib'.
        letter = srcfile[:4] if srcfile[:3] == 'lib' else srcfile[0]
        srcfile_urls.append(
            'http://ftp.debian.org/debian/pool/main/%s/%s/%s'
            % (letter, srcpkg, srcfile))

    # Each URL is followed by a space, including the last one, exactly as
    # the original string concatenation produced.
    srcfiles = ''.join(url + ' ' for url in srcfile_urls)

    binpkg_names = sorted(binpkgs['Package'], key=len)
    # XXX: maybe choose the one that appears the most?
    # Take the first homepage that is actually set, so that a missing
    # value is not rendered as "nan".
    homepage = next(
        (h for h in binpkgs['Homepage'] if not pd.isnull(h)), '')
    tags = set(concat(
        [parse_tags(t) for t in binpkgs['Tag'] if not pd.isnull(t)]))
    langs = [s.title() for s in extract_languages(tags)]

    if name in binpkg_names:
        descpkg = name
    else:
        # Heuristic: choose the package with the shortest name.
        # We could try to do something smarter, like look for the common
        # prefix of the descriptions of all the binary packages.
        descpkg = binpkg_names[0]

    try:
        desc = list(data.descs[
            data.descs['Package'] == descpkg]['Description-en'])[0]
        (short_desc, full_desc) = desc.split('\n', 1)
    except Exception:
        # Best effort: a missing or single-line description leaves both
        # fields empty.
        full_desc = ''
        short_desc = ''

    full_desc = catechise(munge_description(full_desc))

    yield Template('Entry', [
        ('Name', name.capitalize()),
        ('Short description', short_desc),
        ('Full description', full_desc),
        ('Homepage URL', homepage),
        ('User level', ''),
        # XXX get this information from apt-file
        ('Component programs', ''),
        ('VCS checkout command', ''),
        ('Computer languages', ', '.join(langs)),
        ('Status', ''),
        ('Is GNU', 'No'),
        ('Version identifier', version_id),
        ('Version download', srcfiles),
        ('Submitted by', 'Debian import'),
        ('Submitted date', today())])

    yield Template('Import', [
        ('Source', 'Debian'),
        ('Source link',
         'http://packages.debian.org/sid/' + srcpkg_names[0]),
        ('Source packages',
         ', '.join('%s %s' % (k, v) for (k, v) in versions.iteritems())),
        ('Date', today())])

    people = []
    res = []

    for srcpkg in srcpkg_names:
        # NOTE(review): .ix[0] and the [0] lookups below depend on the
        # index type of the stored frames, which is not visible here.
        pkg_cps = data.cps[data.cps['_srcpkg'] == srcpkg].ix[0]
        pkg_cpf = data.cpf[data.cpf['_srcpkg'] == srcpkg]
        pkg_licenses = data.licenses[data.licenses['_srcpkg'] == srcpkg]
        people.extend(extract_people(pkg_cps))
        res.extend(extract_resources(pkg_cps))

        pkg_cl = data.cl[data.cl['_srcpkg'] == srcpkg]
        cl_date = pkg_cl['date']
        # avoid rare bad index error:
        if cl_date.empty:
            continue
        cl_date = cl_date[0]
        cl_uploader = pkg_cl['author'][0]

        for template in srcpkg_extract_licenses(
                pkg_cps, pkg_cpf, pkg_licenses, cl_date, cl_uploader):
            # XXX: eliminate duplicates
            yield template

    for template in people:
        # XXX: eliminate duplicates
        yield template

    for template in res:
        # XXX: eliminate duplicates
        yield template


def filename(s):
    """Return a file name for page *s*.

    Every character outside ``A-Za-z0-9_+.-`` becomes ``_`` and ``.wiki``
    is appended.  An empty *s* fails the assertion.
    """
    s_ = re.sub('[^A-Za-z0-9_+.-]', '_', s)
    assert s_, s
    return s_ + '.wiki'


def output(path, xs):
    """Write ``str(x)`` for each *x* in *xs* to *path*, one per line."""
    with open(path, 'w') as f:
        for x in xs:
            f.write(str(x) + '\n')


def output_multi(path, xs):
    "Output a bunch of pages to a directory with an index."

    index = {}

    if not os.path.exists(path):
        os.makedirs(path)

    for (name, templates) in xs:
        fname = filename(name)
        index[fname] = {'page': name, 'file': fname}
        output(os.path.join(path, fname), templates)

    with open(os.path.join(path, 'index.json'), 'w') as f:
        json.dump(index, f)


def uname_srcpkgs(data, name):
    """Return the source packages whose 'Upstream-Name' equals *name*."""
    pkg_cps = data.cps[data.cps['Upstream-Name'] == name]
    return list(pkg_cps['_srcpkg'])


def export_all(data):
    """Export all packages.

    Returns a generator of (name, templates) tuples.
    """

    # First, find all upstream names and the source packages corresponding
    # to them.
    unames = sorted(set(data.cps['Upstream-Name'].dropna()))

    # For source packages with no upstream name, use the source package
    # name as the upstream name.
    no_uname = sorted(set(data.cps[
        data.cps['Upstream-Name'].isnull()]['_srcpkg']))

    packages = itertools.chain(
        ((uname, uname_srcpkgs(data, uname)) for uname in unames),
        ((srcpkg, [srcpkg]) for srcpkg in no_uname))

    for (name, srcpkgs) in packages:
        if not name:
            continue

        if '\n' in name:
            # Seriously?
            warn('bad name: %r' % name)
            continue

        # Generator; exceptions are delayed.
        templates = export_srcpkgs(data, name, srcpkgs)
        yield (name, templates)


def export_all_to_directory(data, outputdir):
    """Export every package to *outputdir*, one page per file.

    Each page name is printed as it is processed.  A page whose export
    raises ExportFailure is reported with a warning and skipped.
    """
    def _export():
        for (name, templates) in export_all(data):
            print(nouni(name))

            try:
                # Force errors.
                templates = list(templates)
            except ExportFailure as e:
                warn('export failed: %s: %s' % (
                    nouni(name).strip(), nouni(e.message).strip()))
                # Do not write a page for a failed export; the generator
                # is exhausted and would only produce an empty file.
                continue

            yield (name, templates)

    output_multi(outputdir, _export())


def main():
    """Command-line entry point.

    With no argument, export everything to ``output/``.  With one
    argument, treat it as an upstream name and print its templates.
    """
    data = PkgData()
    args = sys.argv[1:]

    if len(args) == 0:
        export_all_to_directory(data, 'output')
    elif len(args) == 1:
        # XXX: assumes argument is an upstream name
        uname = args[0]
        srcpkgs = uname_srcpkgs(data, uname)

        for template in export_srcpkgs(data, uname, srcpkgs):
            print(template)
    else:
        raise RuntimeError('usage: export.py [UPSTREAM-NAME]')


if __name__ == '__main__':
    main()