Adding a simple iterator to the Trie class; now the set operations can use it.
[treecutter.git] / src / tree-cutter.py
#!/usr/bin/python
import argparse
import errno
import fnmatch
import glob
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time

import amara
import pygraphviz as pgv
from amara import bindery
from amara.xslt import transform
from Cheetah.Template import Template
17
# Command-line configuration: --style points at the XSL/template directory,
# --output at the publish target; both default to siblings of the cwd.
parser = argparse.ArgumentParser(description='Process docbook article tree.')
parser.add_argument('--style', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/style/default/')
parser.add_argument('--output', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/htdocs/')
args = parser.parse_args()

style_xslt = args.style+"docbook.xsl"  # docbook -> html stylesheet
style_tmpl = args.style+"index.en.html.tmpl"  # Cheetah page template
outputdir = args.output

valid_scripts = ['.py','.pl']  # xi:include href extensions we may execute
MAXLEVEL = 10000  # sentinel menu depth meaning "no limit" in genMenu
31
def mkdir_p(path):
    """Create path like `mkdir -p`: make intermediate dirs, ignore existing.

    Raises OSError for any failure other than the directory already existing.
    """
    try:
        os.makedirs(path)
    except OSError as exc: # Python >2.5
        # Only swallow EEXIST when the existing path really is a directory;
        # a same-named file would otherwise hide a real error until later.
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
39
def publish(src, target):
    """Mirror src into target with rsync -a --delete.

    Prints an error line with the failing command on non-zero exit and
    returns the rsync return code (0 on success).
    """
    cmd = ["rsync", "-a", "--delete", src, target]
    retcode = subprocess.call(cmd)
    if retcode:
        print('Error: ' + ' '.join(cmd) + ' Returncode [' + str(retcode) + ']')
    return retcode
45
46
# XML namespace prefixes used in every amara xml_select query below.
PREFIXES = {
    u'db': u'http://docbook.org/ns/docbook',   # DocBook 5
    u'xi': u'http://www.w3.org/2001/XInclude', # XInclude
    u'xl': u'http://www.w3.org/1999/xlink',    # XLink
}
50
class Directory():
    """Class containing the state of the directory with articles"""
    def __init__(self):
        self._cwd = '.'
        self._tree = []

    def scan(self):
        """Collect the site link of every article with a title and menu entry."""
        for dirpath, subdirs, files in os.walk(self._cwd):
            for name in files:
                if not fnmatch.fnmatch(name, '*.xml'):
                    continue
                path = os.path.join(dirpath, name)
                doc = bindery.parse(path, prefixes=PREFIXES)
                has_title = doc.xml_select(u'/db:article/db:info/db:title')
                has_menu = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                if has_title and has_menu:
                    # './foo/index.en.xml' -> '/foo/index' -> link '/foo/'
                    stem = path.split('.')[1]
                    self._tree.append(stem.replace('index', ''))

    def set(self):
        """Return the collected links as a set for sitemap comparison."""
        return set(self._tree)
72
class Page():
    """Class representing a version of a webpage"""
    def __init__(self, page):
        # page is a (lang, filename) tuple as built by Link._scan_languages
        self._file = page[1]
        self._lang = page[0]
        self._doc = None
        self._resources = []
        self._title = None
        self._menu = None
        self._rendered_article = None

    def prepare(self):
        """Parse the article, run executable text includes, collect resources."""
        self._doc = bindery.parse(self._file, prefixes=PREFIXES)
        # bug fix: the parsed document is self._doc ("doc" was undefined here)
        if self._doc.xml_select(u'/db:article/db:info/db:title'):
            self._title = unicode(self._doc.article.info.title)
        if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
            self._menu = unicode(self._doc.article.info.titleabbrev)

        dirname = os.path.dirname(self._file)
        code = self._doc.xml_select(u"//xi:include[@parse='text']")
        if code:
            for c in code:
                (p, ext) = os.path.splitext(c.href)
                if ext in valid_scripts:
                    # bug fix: join with a separator (was dirname+c.href)
                    exe = os.path.abspath(os.path.join(dirname, c.href))
                    xml = subprocess.Popen([exe], stdout=subprocess.PIPE)
                    xstr = bindery.parse(str(xml.stdout.read()))
                    idp = c.xml_index_on_parent
                    for x in xstr.xml_children:
                        c.xml_parent.xml_insert(idp, x)
                    # bug fix: remove the include once, after splicing all
                    # children (matches expandXincludeTxt below)
                    c.xml_parent.xml_remove(c)

        # remember linked files and images that exist on disk so they can
        # be copied next to the generated page
        for r in self._doc.xml_select(u"//db:link[@xl:href]"):
            rf = os.path.join(dirname, r.href)
            if os.path.isfile(rf):
                self._resources.append(rf)
        for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
            im = os.path.join(dirname, i.fileref)
            if os.path.isfile(im):
                self._resources.append(im)

    def render(self):
        """Convert the prepared article to HTML with xsltproc."""
        #  amara can not handle the docbook stylesheets
        #  xmlarticle = transform(doc,style_xslt)
        cwd = os.getcwd()
        dirname = os.path.dirname(self._file)
        os.chdir(dirname)
        # infile must be a bare name so xsltproc resolves xincludes in dirname
        infile = os.path.basename(tempfile.mktemp())
        outfile = tempfile.mktemp()
        tfi = open(infile, 'w')
        # bug fix: write self._doc ("doc" was undefined here)
        tfi.write(self._doc.xml_encode())
        tfi.close()
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
        cmd = ["xsltproc", "--xinclude", "--output", outfile, style_xslt, infile]
        retcode = subprocess.call(cmd)
        if retcode:
            print('Error: ' + ' '.join(cmd) + ' Returncode [' + str(retcode) + ']')
        tfo = open(outfile, 'r')
        self._rendered_article = tfo.read()
        tfo.close()
        os.remove(infile)
        os.remove(outfile)
        os.chdir(cwd)

    def template(self, sitemap):
        """Fill the Cheetah template and write the page into the staging dir."""
        htmlmenu = sitemap.gen_menu(self._lang, None)
        levelmenu = sitemap.gen_menu(self._lang, self)
        # bug fix: `levelname` was undefined; use this page's menu title as
        # the section name -- TODO confirm intended value
        levelname = self._menu
        template = Template(file=style_tmpl,
                            searchList=[{'title': self._title},
                                        {'menu': htmlmenu},
                                        {'article': self._rendered_article},
                                        {'levelmenu': levelmenu},
                                        {'levelname': levelname}])
        # NOTE(review): relies on the module-global `tmptarget` staging dir
        outfile = tmptarget + self._file + '.' + self._lang + '.html'
        mkdir_p(os.path.dirname(outfile))
        out = open(outfile, 'w')
        out.write(str(template))
        out.close()
151
152
class Link():
    """Class representing a webpage on the site"""
    def __init__(self, link):
        self._link = link
        self._pages = []
        # a directory link is served by its index page
        path = link + 'index' if link[-1] == '/' else link
        for found in self._scan_languages(path):
            self._pages.append(Page(found))

    def _scan_languages(self, path):
        """Return [(lang, filename), ...] for './<path>.<lang>.xml' files."""
        found = []
        for candidate in glob.glob('.' + path + '*'):
            parts = candidate.split('.')
            if len(parts) > 3 and parts[3] == 'xml':
                found.append((parts[2], candidate))
        return found

    def link(self):
        """The site-relative link this object represents."""
        return self._link
176
class Node():
    """A trie node: a path token, its payload and its child nodes."""
    def __init__(self, token, value):
        self._token = token
        self._value = value
        self._children = []

    def token(self):
        return self._token

    def value(self):
        return self._value

    def children(self):
        return self._children

class Trie():
    """Trie of site content keyed by lists of path tokens."""
    def __init__(self):
        self._root = []

    def __iter__(self):
        return self.inorder(self._root)

    def inorder(self, t):
        """Yield each node's value depth-first, parents before children."""
        for l in t:
            yield l.value()
            # bug fix: recurse into the child *list* and re-yield the values;
            # the old code called inorder(child-node) and discarded the
            # generator, so iteration never descended below the first level
            for v in self.inorder(l.children()):
                yield v

    def _add(self, trie, key, content):
        # consume the leading token; when it was the last one, attach a leaf
        k = key.pop(0)
        if key == []:
            node = Node(k, content)
            trie.append(node)
        else:
            for ch in trie:
                if ch.token() == k:
                    self._add(ch.children(), key, content)

    def add(self, key, content):
        """Insert content under the token list key (the list is consumed)."""
        self._add(self._root, key, content)

    def _graph(self, trie, G):
        for l in trie:
            G.add_node(l.token())
            for ch in l.children():
                G.add_edge(l.token(), ch.token())
            # bug fix: recurse once per node, outside the child loop; the old
            # code recursed per child with the whole list, revisiting subtrees
            self._graph(l.children(), G)

    def graph(self):
        """Print a graphviz representation of the trie rooted at "sitemap"."""
        G = pgv.AGraph(directed=True)
        G.add_node("sitemap")
        for ch in self._root:
            G.add_edge("sitemap", ch.token())
        self._graph(self._root, G)
#        G.layout('dot')
#        G.draw('g.png')
        print(G.string())
235
class Sitemap():
    """Class keeping the internal site structure"""
    def __init__(self):
        self._file = 'sitemap.txt'
        self._tree = Trie()

    def add_link(self, link):
        """Tokenize link by path component and insert it into the trie."""
        # list() so the token list supports pop() in Trie._add
        # (py3's filter() is lazy; harmless on py2)
        tokens = list(filter(None, re.split(r'(^/\w*/|\w*/)', link)))
        self._tree.add(tokens, Link(link))

    # bug fix: the main script calls add_page(); keep both names
    add_page = add_link

    def read_map(self):
        """Load sitemap.txt, one link per whitespace-separated entry."""
        try:
            f = open(self._file)
            sml = f.read().split()
            f.close()
            for line in sml:
                self.add_link(line)
        except IOError:
            print('INFO: Could not read sitemap.txt - one will be created')

    def set(self):
        """All known links as a set (relies on Trie.__iter__)."""
        return set(link.link() for link in self._tree)

    def graph(self):
        """Print the trie as a graphviz graph."""
        self._tree.graph()

    def gen_menu(self, lang, page):
        return 'Generate menu from sitemap - To be implemented'
264
def generateSitemap():
    """Scan the tree for articles, merge with sitemap.txt, and rewrite it.

    Returns the sitemap: a list of dicts (link/title/menu/output/exe/file/
    res/level) for every article that has both a title and a titleabbrev.
    """
    sitemap = []
    try:
        sfile = open('sitemap.txt')
        flist = sfile.read().split()
        sfile.close()
        for f in flist:
            sitemap.append(dict(link=f))
    except IOError:
        print('Sitemap missing - generating one.')

    for dirname, dirnames, filenames in os.walk('.'):
        for filename in filenames:
            if not fnmatch.fnmatch(filename, '*.xml'):
                continue
            xfile = os.path.join(dirname, filename)
            doc = bindery.parse(xfile,
                                prefixes={u'db': u'http://docbook.org/ns/docbook',
                                          u'xi': u'http://www.w3.org/2001/XInclude',
                                          u'xl': u'http://www.w3.org/1999/xlink'})
            title = doc.xml_select(u'/db:article/db:info/db:title')
            menu = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
            code = doc.xml_select(u"//xi:include[@parse='text']")
            resource = doc.xml_select(u"//db:link[@xl:href]")
            image = doc.xml_select(u"//db:imagedata[@fileref]")
            # flag pages that contain executable text includes
            exe = 0
            for c in code:
                (p, ext) = os.path.splitext(c.href)
                if ext in valid_scripts:
                    exe = 1

            if not (title and menu):
                continue
            # './foo/index.en.xml' -> '/foo/index' -> link '/foo/'
            base = xfile.split('.')[1]
            link = base.replace('index', '')
            # list() for py3 compat: filter() there is lazy and has no len()
            level = len(list(filter(None, re.split(r'(^/\w*/|\w*/)', link))))
            # resources = linked files and images that exist on disk
            res = []
            for r in resource:
                rf = os.path.join(dirname, r.href)
                if os.path.isfile(rf):
                    res.append(rf)
            for i in image:
                im = os.path.join(dirname, i.fileref)
                if os.path.isfile(im):
                    res.append(im)
            page = dict(title=unicode(doc.article.info.title),
                        menu=unicode(doc.article.info.titleabbrev),
                        output=os.path.join(dirname,
                                            filename.replace('xml', 'html')),
                        exe=exe,
                        file=xfile,
                        res=res,
                        level=level)
            # merge with an existing sitemap entry, or append a new one
            found = 0
            for l in sitemap:
                if l['link'] == link:
                    found = 1
                    l.update(page)
            if not found:
                print("adding " + link + " to sitemap")
                dd = dict(link=link)
                dd.update(page)
                sitemap.append(dd)
    # persist the (possibly extended) link list for the next run
    sfile = open('sitemap.txt', 'w')
    for l in sitemap:
        sfile.write(l['link'] + '\n')
    sfile.close()
    return sitemap
331
def expandXincludeTxt(page):
    """Parse the article and splice in the output of executable xi:includes."""
    doc = bindery.parse(page['file'],
                        prefixes={u'db': u'http://docbook.org/ns/docbook',
                                  u'xi': u'http://www.w3.org/2001/XInclude'})
    if not page['exe']:
        return doc
    for inc in doc.xml_select(u"//xi:include[@parse='text']"):
        ext = os.path.splitext(inc.href)[1]
        if ext not in valid_scripts:
            continue
        # run the referenced script and parse its stdout as XML
        script = os.path.join(os.path.abspath(inc.href))
        proc = subprocess.Popen([script], stdout=subprocess.PIPE)
        parsed = bindery.parse(str(proc.stdout.read()))
        pos = inc.xml_index_on_parent
        for child in parsed.xml_children:
            inc.xml_parent.xml_insert(pos, child)
        inc.xml_parent.xml_remove(inc)
    return doc
349
def xsltConvert(doc):
    """Run the docbook stylesheet (xsltproc) over doc; return the HTML string.

    NOTE(review): reads the module-global `page` dict set by the main loop to
    locate the article's directory -- confirm; a parameter would be cleaner
    but would change the call signature.
    """
#  amara can not handle the docbook stylesheets
#  xmlarticle = transform(doc,style_xslt)
    cwd = os.getcwd()
    rundir = os.path.dirname(page['file'])
    os.chdir(rundir)
    # mktemp is race-prone but kept for behaviour; infile must be a bare
    # name relative to rundir so xsltproc resolves xincludes there
    infile = os.path.basename(tempfile.mktemp())
    outfile = tempfile.mktemp()
    try:
        with open(infile, 'w') as tfi:
            tfi.write(doc.xml_encode())
        cmd = ["xsltproc", "--xinclude", "--output", outfile, style_xslt, infile]
        retcode = subprocess.call(cmd)
        if retcode:
            print('Error: ' + ' '.join(cmd) + ' Returncode [' + str(retcode) + ']')
        with open(outfile, 'r') as tfo:
            result = tfo.read()
    finally:
        # clean up the temp files and restore cwd even if xsltproc failed
        if os.path.exists(infile):
            os.remove(infile)
        if os.path.exists(outfile):
            os.remove(outfile)
        os.chdir(cwd)
    return result
373
def genMenu(page, sitemap, slevel, elevel):
    """Build a nested <ul> menu from sitemap entries between slevel and elevel.

    Returns (html, title); title is the menu name of the enclosing section
    (or of '/' when generating the full menu).
    """
    title = None
    if elevel == MAXLEVEL or elevel == 1 or page is None:
        # full menu: consider every sitemap entry
        html = '<ul>\n'
        entries = sitemap
    else:
        # level menu: only the siblings of `page`
        html = '<ul class="tree">\n'
        i = sitemap.index(page)
        while sitemap[i]['level'] == page['level']:
            i -= 1
        title = sitemap[i]['menu']
        i += 1
        entries = []
        while i < len(sitemap) and sitemap[i]['level'] == page['level']:
            entries.append(sitemap[i])
            i += 1

    prev = slevel
    for entry in entries:
        if not (slevel <= entry['level'] <= elevel):
            continue
        if not title and entry['link'] == '/':
            title = entry['menu']

        if prev < entry['level']:
            html += '<ul>\n'
        elif prev > entry['level']:
            # close a still-open directory item before popping a level
            if entry['link'][-1] == '/':
                html += '</li>\n'
            html += '</ul>\n</li>\n'
        css = ' class="selected"' if page is not None and page == entry else ''
        html += '<li%s><a href="%s">%s</a>' % (css, entry['link'], entry['menu'])
        # directory links stay open so children nest inside the <li>
        if entry['link'][-1] != '/' or entry['link'] == '/':
            html += '</li>\n'
        prev = entry['level']
    return (html + '</ul>\n', title)
413
def writeToTemplate(page, doc, sitemap):
    """Render one page through the Cheetah template and copy its resources."""
    menu, menuname = genMenu(page, sitemap, 1, MAXLEVEL)
    levelmenu, levelname = genMenu(page, sitemap, page['level'], page['level'])
    filled = Template(file=style_tmpl,
                      searchList=[{'title': page['title']},
                                  {'menu': menu},
                                  {'article': doc},
                                  {'levelmenu': levelmenu},
                                  {'levelname': levelname}])
    outfile = tmptarget + page['output']
    mkdir_p(os.path.dirname(outfile))
    with open(outfile, 'w') as out:
        out.write(str(filled))
    # copy linked files and images next to the generated page
    for res in page['res']:
        mkdir_p(os.path.dirname(tmptarget + res))
        shutil.copyfile(res, tmptarget + res)
431
def createSitemap(sitemap):
    """Write the sitemap overview page (the full menu doubles as the article)."""
    menu, menuname = genMenu(None, sitemap, 1, MAXLEVEL)
    filled = Template(file=style_tmpl,
                      searchList=[{'title': 'Sitemap'},
                                  {'menu': menu},
                                  {'article': menu},
                                  {'levelmenu': ''},
                                  {'levelname': ''}])
    outfile = tmptarget + 'sitemap.en.html'
    mkdir_p(os.path.dirname(outfile))
    with open(outfile, 'w') as out:
        out.write(str(filled))
446
# --- main: reconcile the on-disk article tree with sitemap.txt, then build ---
dir_ = Directory()
sitemap = Sitemap()

dir_.scan()
sitemap.read_map()

missing = dir_.set() - sitemap.set()
removed = sitemap.set() - dir_.set()
for page in removed:
    # bug fix: report each individual page (printing `removed` raised
    # TypeError: cannot concatenate str and set)
    print(page + ' pages missing!!')

for page in missing:
    print('adding missing page ' + page)
    # bug fix: Sitemap has add_link(), add_page() did not exist
    sitemap.add_link(page)

sitemap.graph()


sitemap = generateSitemap()
tmptarget = tempfile.mkdtemp() + '/'
for page in sitemap:
    t1 = time.time()
    # explicit write instead of a py2 trailing-comma print so the timing
    # lands on the same line (and the line works on py2 and py3)
    sys.stdout.write("Page : %-30s %30s " % (page['link'],
                        time.ctime(os.stat(page['file']).st_mtime)))
    doc = expandXincludeTxt(page)
    pubdoc = xsltConvert(doc)
    writeToTemplate(page, pubdoc, sitemap)
    t2 = time.time()
    print("[%5.2f s]" % (round(t2 - t1, 2)))

createSitemap(sitemap)
publish(tmptarget, args.output)
publish(args.style + "css", args.output)
publish(args.style + "images", args.output)