From: Fredrik Unger
Date: Sun, 3 Apr 2011 19:26:57 +0000 (+0200)
Subject: Initial refactoring to use objects. Simple trie implementation to better
X-Git-Tag: v1.0~23
X-Git-Url: https://source.tree.se/git?p=treecutter.git;a=commitdiff_plain;h=b73ab8db3f73bbdcd88746151f5e0d4a3d8960a1

Initial refactoring to use objects. Simple trie implementation to better
support the sitemap structure. Initial tests of the objects but still
keeping the initial working code.
---

diff --git a/src/tree-cutter.py b/src/tree-cutter.py
index 351f5d4..97ac6a2 100755
--- a/src/tree-cutter.py
+++ b/src/tree-cutter.py
@@ -9,6 +9,7 @@ import errno
 import time
 import argparse
 import shutil
+import pygraphviz as pgv
 from amara import bindery
 from amara.xslt import transform
 from Cheetah.Template import Template
@@ -41,6 +42,162 @@ def publish(src,target):
     if retcode:
         print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
 
+
+PREFIXES={u'db': u'http://docbook.org/ns/docbook',
+          u'xi': u'http://www.w3.org/2001/XInclude',
+          u'xl': u'http://www.w3.org/1999/xlink'}
+
+class Directory():
+    """Class containing the state of the directory with articles"""
+    def __init__(self):
+        self._cwd = '.'
+        self._tree = []
+
+    def scan(self):
+        """Collect a link for every article found below the directory.
+
+        Every *.xml file is parsed; files that have both a title and a
+        titleabbrev in the article info are added to the list of links.
+        """
+        for dirname, dirnames, filenames in os.walk(self._cwd):
+            for filename in filenames:
+                if fnmatch.fnmatch(filename, '*.xml'):
+                    file_ = os.path.join(dirname,filename)
+                    doc = bindery.parse(file_, prefixes=PREFIXES)
+                    title = doc.xml_select(u'/db:article/db:info/db:title')
+                    menu = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
+                    if title and menu:
+                        # NOTE(review): takes the text between the leading
+                        # '.' of the path and the next '.', so it relies on
+                        # _cwd being '.' and on no other dots in the path
+                        # before the extension - confirm this is intended.
+                        base = file_.split('.')[1]
+                        link = base.replace('index','')
+                        self._tree.append(link)
+
+    def set(self):
+        """Return the scanned links as a set."""
+        return set(self._tree)
+
+class Page():
+    """Class representing a webpage on the site"""
+    def __init__(self,link):
+        self._link = link
+        self._resources = []
+        self._script = 0
+
+    def link(self):
+        """Return the link this page was created with."""
+        return self._link
+
+class Node():
+    """A node of the Trie: a token, a value and a list of child nodes"""
+    def __init__(self,token,value):
+        self._token = token
+        self._value = value
+        self._children = []
+
+    def token(self):
+        return self._token
+
+    def value(self):
+        return self._value
+
+    def children(self):
+        return self._children
+
+class Trie():
+    """Tree of Nodes where every entry is stored under a list of tokens"""
+    def __init__(self):
+        self._root = []
+
+    def _add(self,trie, key, content):
+        """Store content below trie, following the tokens in key.
+
+        key is consumed: tokens are popped from its front on the way down.
+        """
+        k = key.pop(0)
+        if key == []:
+            # the key is a leaf
+            node = Node(k,content)
+            trie.append(node)
+            return
+        for ch in trie:
+            if ch.token() == k:
+                # Follow the first match only: the recursion consumes key,
+                # so trying a second node with the same token would work
+                # on the wrong tokens or pop from an empty list.
+                self._add(ch.children(), key, content)
+                return
+        # Parent is not in the trie - report it instead of dropping silently
+        print 'WARNING: %s not in sitemap tree, entry not added' % (k,)
+
+    def add(self,key, content):
+        """Add content under key, a list of tokens; an empty key is ignored."""
+        if not key:
+            return
+        self._add(self._root, key, content)
+
+    def _graph(self, trie, G):
+        """Add the nodes in trie and the edges to their children to G."""
+        for l in trie:
+            G.add_node(l.token())
+            for ch in l.children():
+                G.add_edge(l.token(),ch.token())
+            self._graph(l.children(), G)
+
+    def graph(self):
+        """Print the trie as a directed graph rooted at "sitemap"."""
+        G = pgv.AGraph(directed=True)
+        G.add_node("sitemap")
+        for ch in self._root:
+            G.add_edge("sitemap",ch.token())
+        self._graph(self._root, G)
+#        G.layout('dot')
+#        G.draw('g.png')
+        print G.string()
+
+class Sitemap():
+    """Class keeping the internal site structure"""
+    def __init__(self):
+        self._file = 'sitemap.txt'
+        self._pages = []
+        self._tree = Trie()
+
+    def add_page(self, link):
+        """Create a Page for link and store it in the page list and tree."""
+        page = Page(link)
+        self._pages.append(page)
+        # Split the link into path tokens, e.g. '/a/b/' -> ['/a/', 'b/']
+        tokens = filter(None,re.split(r'(^/\w*/|\w*/)',link))
+        self._tree.add(tokens,page)
+
+    def read_map(self):
+        """Add a page for every whitespace separated link in the sitemap file.
+
+        A file that cannot be read is reported and otherwise ignored.
+        """
+        try:
+            # with closes the file also when reading fails
+            with open(self._file) as f:
+                sml = f.read().split()
+        except IOError:
+            print 'INFO: Could not read sitemap.txt - one will be created'
+            return
+        for line in sml:
+            self.add_page(line)
+
+    def set(self):
+        """Return the links of all pages as a set."""
+        return set(page.link() for page in self._pages)
+
+    def pages(self):
+        return self._pages
+
+    def graph(self):
+        """Print the graph of the page tree."""
+        self._tree.graph()
+
 def generateSitemap():
     sitemap = []
     try:
@@ -223,6 +380,23 @@ def createSitemap(sitemap):
     out.write(str(template))
     out.close()
 
+dir_ = Directory()
+sitemap = Sitemap()
+
+dir_.scan()
+sitemap.read_map()
+
+missing = dir_.set() - sitemap.set()
+removed = sitemap.set() - dir_.set()
+for page in removed:
+    print page+' pages missing!!'
+
+# Sorted so that a parent link is added before the pages below it.
+for page in sorted(missing):
+    print 'adding missing page '+page
+    sitemap.add_page(page)
+
+sitemap.graph()
 sitemap = generateSitemap()