#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Export Debian package metadata as wiki templates.

The package, copyright and changelog tables are read from the pandas HDF
stores ``pkg.h5``, ``cp.h5`` and ``cl.h5`` in the current directory.

Usage:
    export.py                  write one ``.wiki`` page per package, plus an
                               ``index.json``, into ``output/``
    export.py UPSTREAM-NAME    print the templates of one package to stdout

This module targets Python 2 (it relies on the ``unicode`` builtin); the
syntax is kept forward-compatible where that costs nothing.
"""

import datetime
import itertools
import json
import os
import re
import sys
import textwrap

import pandas as pd

import license

# Package fields which refer to download locations.
download_keys = """ Origin Original-Source Source Source-Code X-Origin X-Original-Package X-Source """


def concat(xss):
    """Flatten one level of nesting: return a list of the items of each xs."""
    return list(itertools.chain.from_iterable(xss))


def indent(s):
    """Prefix every non-empty line of *s* with a space; keep blank lines."""
    # NOTE(review): the one-space prefix is reproduced exactly as received.
    # If the original literal was wider it was collapsed before review --
    # TODO confirm against version control.
    return ''.join(
        ' %s\n' % line if line else '\n'
        for line in s.splitlines())


def today():
    """Return the current local date formatted as ``YYYY-MM-DD``."""
    return datetime.datetime.now().strftime('%Y-%m-%d')


def warn(*x):
    """Write ``warning:`` followed by the space-separated *x* to stderr."""
    parts = ('warning:',) + x
    sys.stderr.write(' '.join(str(nouni(part)) for part in parts) + '\n')


class ExportFailure(Exception):
    """Raised when a package cannot be exported."""


class PkgData(object):
    """All tables needed for an export, loaded eagerly from the HDF stores.

    Attributes (names of the stored tables in parentheses):
        pkgs (packages), descs (descriptions), srcs (sources),
        cpf (cp_files), cps (cp_summary), licenses (licenses),
        cl (cl_versions).
    """

    def __init__(self):
        (self.pkgs, self.descs, self.srcs) = self._load(
            'pkg.h5', ('packages', 'descriptions', 'sources'))
        (self.cpf, self.cps, self.licenses) = self._load(
            'cp.h5', ('cp_files', 'cp_summary', 'licenses'))
        (self.cl,) = self._load('cl.h5', ('cl_versions',))

    @staticmethod
    def _load(path, keys):
        """Read the tables *keys* from the store at *path*, then close it."""
        store = pd.HDFStore(path)
        try:
            return [store[key] for key in keys]
        finally:
            # Close the store even when one of the keys is missing.
            store.close()


def nouni(s):
    """Encode *s* as UTF-8 if it is a ``unicode`` object, else return it."""
    return s.encode('utf8') if isinstance(s, unicode) else s


class Template(object):
    """A named wiki template with an ordered list of (field, value) pairs."""

    def __init__(self, name, values):
        self.name = name
        self.values = values

    def __str__(self):
        """Render as ``{{name``, one ``|field=value`` line each, ``}}``."""
        fields = '\n'.join(
            '|%s=%s' % (nouni(n), nouni(v)) for (n, v) in self.values)
        return '{{%s\n%s\n}}' % (nouni(self.name), fields)


def parse_tags(s):
    """Split a ``Tag`` field into tags: drop newlines, split on ``', '``."""
    return s.replace('\n', '').split(', ')


def extract_languages(tags):
    """Return the sorted, de-duplicated languages named by *tags*.

    Recognised forms are ``implemented-in::LANG`` and ``devel::lang:LANG``.
    Tags of any other shape, including tags without ``::``, are ignored.
    """
    langs = set()
    for tag in tags:
        (facet, _sep, value) = tag.partition('::')
        if facet == 'implemented-in':
            langs.add(value)
        elif facet == 'devel' and value.startswith('lang:'):
            langs.add(value.split(':')[1])
    return sorted(langs)


# ---------------------------------------------------------------------------
# NOTE(review): part of this file was lost before it reached review.  The
# text from inside ``catechise`` up to the tail of what appears to be the
# person-parsing helper is missing (apparently everything between a "<" and
# the following ">" was stripped).  The surviving fragments are reproduced
# verbatim below.
#
# ``catechise``, ``munge_description``, ``srcpkg_extract_licenses`` and
# ``parse_person`` are all called further down, but none of them can be
# reconstructed from what is left.  Restore them from version control before
# running this module; nothing has been invented here.
#
#   def catechise(s):
#       heresies = ["open source", "debian", "(?
#   [... lost ...]
#       ]+)>', s)
#       if match:
#           return (match.group(1), match.group(2))
#       else:
#           return (s, '')
# ---------------------------------------------------------------------------


def extract_people(df):
    """Yield a ``Person`` template for the ``Upstream-Contact`` field, if set.

    *df* is a row of copyright-summary data; null fields are dropped first.
    """
    # XXX: extract contributors, maintainers
    df = df.dropna()
    if 'Upstream-Contact' in df:
        (name, email) = parse_person(df['Upstream-Contact'])
        yield Template('Person', [
            ('Real name', name),
            ('Role', 'contact'),
            ('Email', email)])


def extract_resources(cp_header):
    """Yield a ``Resource`` template for each download field that is set.

    The candidate field names are the whitespace-separated entries of
    ``download_keys``; null fields of *cp_header* are dropped first.
    """
    cp_header = cp_header.dropna()
    for key in download_keys.split():
        if key in cp_header:
            yield Template('Resource', [
                ('Resource kind', 'Download'),
                ('Resource URL', cp_header[key])])


def export_srcpkgs(data, name, srcpkg_names):
    """Yield the templates making up the page for *name*.

    The page is assembled from the source packages in *srcpkg_names* and the
    binary packages built from them.  Yields nothing, after a warning, when
    there are no source packages or no binary packages.

    This is a generator, so errors surface only when it is iterated.
    """
    if not srcpkg_names:
        # pd.concat() raises on an empty list and the 'Source link' below
        # needs a first element, so bail out early.
        warn('no source packages found for', name)
        return

    # Map source package names to binary packages.
    binpkgs = pd.concat([
        data.pkgs[data.pkgs['_srcpkg'] == srcpkg]
        for srcpkg in srcpkg_names])
    if len(binpkgs) == 0:
        warn('no binary packages found for', srcpkg_names)
        return

    # Download URL of every source file, each followed by a space.
    srcfiles = ''
    for srcpkg in srcpkg_names:
        srcfile = data.srcs[
            data.srcs['_srcpkg'] == srcpkg]['srcfile'].values[0]
        # Pool directory: first letter, or the first four characters for
        # names starting with "lib".
        letter = srcfile[:4] if srcfile[:3] == 'lib' else srcfile[0]
        srcfiles += 'http://ftp.debian.org/debian/pool/main/%s/%s/%s ' % (
            letter, srcpkg, srcfile)

    # Make note of which versions of the source packages we're looking at.
    # The version reported for the entry itself is that of the last binary
    # package row.
    versions = {}
    last_version = None
    for (_i, pkg) in binpkgs.iterrows():
        versions[pkg['_srcpkg']] = pkg['Version']
        last_version = pkg['Version']

    binpkg_names = sorted(binpkgs['Package'], key=len)
    homepages = list(binpkgs['Homepage'])
    # XXX: maybe choose the one that appears the most?
    homepage = homepages[0] if homepages else ''
    tags = set(concat(
        [parse_tags(t) for t in binpkgs['Tag'] if not pd.isnull(t)]))
    langs = [s.title() for s in extract_languages(tags)]

    if name in binpkg_names:
        descpkg = name
    else:
        # Heuristic: choose the package with the shortest name.
        # We could try to do something smarter, like look for the common
        # prefix of the descriptions of all the binary packages.
        descpkg = binpkg_names[0]

    try:
        desc = list(data.descs[
            data.descs['Package'] == descpkg]['Description-en'])[0]
        # First line is the short description, the rest the full one.
        (short_desc, _sep, full_desc) = desc.partition('\n')
    except (IndexError, AttributeError):
        # No description row at all, or a null (non-string) description.
        short_desc = ''
        full_desc = ''

    full_desc = catechise(munge_description(full_desc))

    yield Template('Entry', [
        ('Name', name.capitalize()),
        ('Short description', short_desc),
        ('Full description', full_desc),
        ('Homepage URL', homepage),
        ('User level', ''),
        # XXX get this information from apt-file
        ('Component programs', ''),
        ('VCS checkout command', ''),
        ('Computer languages', ', '.join(langs)),
        ('Status', ''),
        ('Is GNU', 'No'),
        ('Version identifier', last_version),
        ('Version download', srcfiles),
        ('Submitted by', 'Debian import'),
        ('Submitted date', today())])

    yield Template('Import', [
        ('Source', 'Debian'),
        ('Source link',
         'http://packages.debian.org/sid/' + srcpkg_names[0]),
        # Sorted so that the output is the same from run to run.
        ('Source packages',
         ', '.join('%s %s' % (k, v) for (k, v) in sorted(versions.items()))),
        ('Date', today())])

    people = []
    res = []
    for srcpkg in srcpkg_names:
        # NOTE(review): ``.ix`` is gone from current pandas, and whether it
        # is label- or position-based here depends on the stored index.
        # Confirm which is meant before switching to ``.loc``/``.iloc``.
        pkg_cps = data.cps[data.cps['_srcpkg'] == srcpkg].ix[0]
        pkg_cpf = data.cpf[data.cpf['_srcpkg'] == srcpkg]
        pkg_licenses = data.licenses[data.licenses['_srcpkg'] == srcpkg]
        people.extend(list(extract_people(pkg_cps)))
        res.extend(list(extract_resources(pkg_cps)))

        pkg_cl = data.cl[data.cl['_srcpkg'] == srcpkg]
        cl_date = pkg_cl['date']
        # avoid rare bad index error:
        if cl_date.empty:
            continue
        cl_date = cl_date[0]
        cl_uploader = pkg_cl['author'][0]

        for template in srcpkg_extract_licenses(
                pkg_cps, pkg_cpf, pkg_licenses, cl_date, cl_uploader):
            # XXX: eliminate duplicates
            yield template

    for template in people:
        # XXX: eliminate duplicates
        yield template

    for template in res:
        # XXX: eliminate duplicates
        yield template

    #yield Template('Software category', [
    #    ('Resource kind', ''),
    #    ('Resource URL', '')])


def filename(s):
    """Return a page file name for *s*.

    Every character outside ``A-Za-z0-9_+.-`` becomes ``_`` and ``.wiki`` is
    appended.  *s* must not be empty.
    """
    s_ = re.sub('[^A-Za-z0-9_+.-]', '_', s)
    assert s_, s
    return s_ + '.wiki'


def output(path, xs):
    """Write ``str(x)`` for each x in *xs* to *path*, one per line."""
    with open(path, 'w') as f:
        for x in xs:
            f.write(str(x) + '\n')


def output_multi(path, xs):
    """Output a bunch of pages to a directory with an index.

    *xs* yields ``(name, templates)`` pairs.  Each pair is written to its own
    file under *path*, which is created if missing.  ``index.json`` maps
    every file name to its page name and file name.
    """
    index = {}
    if not os.path.exists(path):
        os.makedirs(path)

    for (name, templates) in xs:
        fname = filename(name)
        index[fname] = {'page': name, 'file': fname}
        output(os.path.join(path, fname), templates)

    # Use a context manager so the index is flushed and closed.
    with open(os.path.join(path, 'index.json'), 'w') as f:
        json.dump(index, f)


def uname_srcpkgs(data, name):
    """Return the source package names whose ``Upstream-Name`` is *name*."""
    pkg_cps = data.cps[data.cps['Upstream-Name'] == name]
    return list(pkg_cps['_srcpkg'])


def export_all(data):
    """Export all packages.

    Returns a generator of (name, templates) tuples.  ``templates`` is itself
    a lazy generator, so export errors are raised only when it is consumed.
    """
    # First, find all upstream names and the source packages corresponding
    # to them.
    unames = sorted(set(data.cps['Upstream-Name'].dropna()))

    # For source packages with no upstream name, use the source package
    # name as the upstream name.
    no_uname = sorted(set(data.cps[
        data.cps['Upstream-Name'].isnull()]['_srcpkg']))

    packages = itertools.chain(
        ((uname, uname_srcpkgs(data, uname)) for uname in unames),
        ((srcpkg, [srcpkg]) for srcpkg in no_uname))

    for (name, srcpkgs) in packages:
        if not name:
            continue

        if '\n' in name:
            # Seriously?
            warn('bad name: %r' % name)
            continue

        # Generator; exceptions are delayed.
        templates = export_srcpkgs(data, name, srcpkgs)
        yield (name, templates)


def export_all_to_directory(data, outputdir):
    """Export every package into *outputdir*, printing each name.

    A package whose export raises ``ExportFailure`` is reported on stderr and
    skipped, so no page or index entry is written for it.
    """
    def _export():
        for (name, templates) in export_all(data):
            print(nouni(name))

            try:
                # Force errors.
                templates = list(templates)
            except ExportFailure as e:
                message = e.args[0] if len(e.args) == 1 else ''
                warn('export failed: %s: %s' % (
                    str(nouni(name)).strip(),
                    str(nouni(message)).strip()))
                # Nothing usable was produced; do not emit an empty page.
                continue

            yield (name, templates)

    output_multi(outputdir, _export())


def main():
    """Command-line entry point; see the module docstring for usage."""
    args = sys.argv[1:]
    if len(args) > 1:
        # Reject bad usage before paying for the data load.
        raise RuntimeError('usage: %s [UPSTREAM-NAME]' % sys.argv[0])

    data = PkgData()

    if not args:
        export_all_to_directory(data, 'output')
    else:
        # XXX: assumes argument is an upstream name
        uname = args[0]
        srcpkgs = uname_srcpkgs(data, uname)
        for template in export_srcpkgs(data, uname, srcpkgs):
            print(template)


if __name__ == '__main__':
    main()