b21c09188d1022f69ec294edf7aab08a55319883
[treecutter.git] / src / tree-cutter.py
1 #!/usr/bin/python
2 import os
3 import fnmatch
4 import subprocess
5 import amara
6 import re
7 import tempfile
8 import errno
9 import time
10 import argparse
11 import shutil
12 import pygraphviz as pgv
13 import glob
14 from amara import bindery
15 from amara.xslt import transform
16 from Cheetah.Template import Template
17
# Command-line interface: both paths default to siblings of the current
# working directory (the script is expected to run from the article tree).
parser = argparse.ArgumentParser(description='Process docbook article tree.')
parser.add_argument('--style', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/style/default/')
parser.add_argument('--output', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/htdocs/')
args = parser.parse_args()

# Derived paths: the DocBook XSLT stylesheet and the Cheetah page template
# both live inside the chosen style directory.
style_xslt = args.style+"docbook.xsl"
style_tmpl = args.style+"index.en.html.tmpl"
outputdir = args.output

# Staging area; the finished site is rsynced from here to the output dir.
tmptarget = tempfile.mkdtemp()+'/'

# xi:include'd files with these extensions are executed and their stdout
# is inlined into the document (see Page.prepare / expandXincludeTxt).
valid_scripts = ['.py','.pl']
# Sentinel menu depth meaning "no level limit" (see genMenu).
MAXLEVEL = 10000
33
def mkdir_p(path):
    """Create `path` like `mkdir -p`: all parents, no error if it exists."""
    try:
        os.makedirs(path)
    except OSError as exc:
        # An already-existing directory is fine; re-raise anything else.
        if exc.errno != errno.EEXIST:
            raise
41
42 def publish(src,target):
43     cmd = ["rsync","-a","--delete",src,target]
44     retcode = subprocess.call(cmd)
45     if retcode:
46         print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
47
48
# XML namespace prefixes shared by every amara xml_select() query below.
PREFIXES={u'db': u'http://docbook.org/ns/docbook',
          u'xi': u'http://www.w3.org/2001/XInclude',
          u'xl': u'http://www.w3.org/1999/xlink'}
52
class Directory():
    """Class containing the state of the directory with articles"""
    def __init__(self):
        self._cwd = '.'
        self._tree = []

    def scan(self):
        """Walk the tree and record a link for every article that has both
        a title and a titleabbrev (i.e. every publishable page)."""
        for dirpath, subdirs, files in os.walk(self._cwd):
            for name in files:
                if not fnmatch.fnmatch(name, '*.xml'):
                    continue
                path = os.path.join(dirpath, name)
                doc = bindery.parse(path, prefixes=PREFIXES)
                has_title = doc.xml_select(u'/db:article/db:info/db:title')
                has_menu = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                if has_title and has_menu:
                    # './foo/index.en.xml' -> '/foo/index' -> '/foo/'
                    link = path.split('.')[1].replace('index', '')
                    self._tree.append(link)

    def set(self):
        """Return the scanned links as a set."""
        return set(self._tree)
74
class Page():
    """Class representing a version of a webpage.

    One language variant of an article; `page` is a (language, filename)
    tuple as produced by Link._scan_languages().
    """
    def __init__(self, page):
        self._file = page[1]           # path to the .xml source
        self._lang = page[0]           # language code, e.g. 'en'
        self._doc = None               # parsed amara document (set by prepare)
        self._resources = []           # linked files/images found in the page
        self._title = None
        self._menu = None
        self._rendered_article = None  # HTML produced by render()

    def language(self):
        return self._lang

    def menu(self):
        return self._menu

    def set_article(self, art):
        self._rendered_article = art

    def prepare(self):
        """Parse the XML, execute text xincludes, and collect resources."""
        self._doc = bindery.parse(self._file, prefixes=PREFIXES)
        if self._doc.xml_select(u'/db:article/db:info/db:title'):
            self._title = unicode(self._doc.article.info.title)
        if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
            self._menu = unicode(self._doc.article.info.titleabbrev)

        dirname = os.path.dirname(self._file)
        code = self._doc.xml_select(u"//xi:include[@parse='text']")
        if code:
            for c in code:
                (p, ext) = os.path.splitext(c.href)
                if ext in valid_scripts:
                    # Fixed: was os.path.abspath(dirname+c.href), which glued
                    # the directory and href together without a separator.
                    exe = os.path.abspath(os.path.join(dirname, c.href))
                    xml = subprocess.Popen([exe], stdout=subprocess.PIPE)
                    xstr = bindery.parse(str(xml.stdout.read()))
                    idp = c.xml_index_on_parent
                    for x in xstr.xml_children:
                        c.xml_parent.xml_insert(idp, x)
                    # Fixed: remove the include element once, after all
                    # children are inserted (it was inside the loop above;
                    # expandXincludeTxt() has the correct pattern).
                    c.xml_parent.xml_remove(c)

        # Remember every linked file and image that exists on disk so the
        # templating step can copy it next to the generated HTML.
        for r in self._doc.xml_select(u"//db:link[@xl:href]"):
            rf = os.path.join(dirname, r.href)
            if os.path.isfile(rf):
                self._resources.append(rf)
        for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
            im = os.path.join(dirname, i.fileref)
            if os.path.isfile(im):
                self._resources.append(im)

    def render(self):
        """Run xsltproc over the prepared document; store the HTML result."""
        #  amara can not handle the docbook stylesheets
        #  xmlarticle = transform(doc,style_xslt)
        cwd = os.getcwd()
        dirname = os.path.dirname(self._file)
        os.chdir(dirname)
        infile = os.path.basename(tempfile.mktemp())
        outfile = tempfile.mktemp()
        tfi = open(infile, 'w')
        tfi.write(self._doc.xml_encode())
        tfi.close()
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
        cmd = ["xsltproc", "--xinclude", "--output", outfile, style_xslt, infile]
        retcode = subprocess.call(cmd)
        if retcode:
            print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
        tfo = open(outfile, 'r')
        self._rendered_article = tfo.read()
        tfo.close()
        os.remove(infile)
        os.remove(outfile)
        os.chdir(cwd)

    def template(self, sitemap):
        """Render this page through the Cheetah template into tmptarget."""
        htmlmenu = sitemap.gen_menu(self._lang, None, None)
        levelmenu = sitemap.gen_menu(self._lang, self, "tree")
        template = Template(file=style_tmpl,
                            searchList=[{'title': self._title},
                                        {'menu': htmlmenu},
                                        {'article': self._rendered_article},
                                        {'levelmenu': levelmenu},
                                        {'levelname': 'Menu'}])
        # foo.en.xml -> foo.en.html (only the last 'xml' is replaced)
        outfile = tmptarget + 'html'.join(self._file.rsplit('xml', 1))
        mkdir_p(os.path.dirname(outfile))
        out = open(outfile, 'w')
        out.write(str(template))
        out.close()
162
163
class Link():
    """Class representing a webpage on the site"""
    def __init__(self, link):
        self._link = link
        self._pages = []
        # A trailing slash means the directory's index page.
        path = link
        if self._link[-1] == '/':
            path += 'index'
        for entry in self._scan_languages(path):
            self._pages.append(Page(entry))

    def _scan_languages(self, path):
        """Return (lang, filename) pairs for every './<path>.<lang>.xml'."""
        found = []
        for candidate in glob.glob('.' + path + '*'):
            parts = candidate.split('.')
            if len(parts) > 3 and parts[3] == 'xml':
                found.append((parts[2], candidate))
        return found

    def link(self):
        return self._link

    def prepare(self):
        for pg in self._pages:
            pg.prepare()

    def languages(self):
        """List the language codes this link is available in."""
        return [pg.language() for pg in self._pages]

    def render(self):
        for pg in self._pages:
            pg.render()

    def template(self, sitemap):
        for pg in self._pages:
            pg.template(sitemap)

    def page(self, lang):
        """Return the Page for `lang`, or None if no such translation."""
        for pg in self._pages:
            if pg.language() == lang:
                return pg
        return None
210
class Node():
    """One trie node: a path token, its payload, and a list of children."""

    def __init__(self, token, value):
        self._token = token
        self._value = value
        self._children = []

    def token(self):
        """Return this node's path token."""
        return self._token

    def value(self):
        """Return the payload stored at this node."""
        return self._value

    def children(self):
        """Return the (mutable) list of child nodes."""
        return self._children
225
class Trie():
    """Prefix tree over path tokens; each node carries a Link payload."""
    def __init__(self):
        self._root = []

    def __iter__(self):
        return self.inorder(self._root)

    def inorder(self, t):
        """Yield every stored value depth-first, parents before children."""
        for l in t:
            yield l.value()
            for x in self.inorder(l.children()):
                yield x

    def _add(self, trie, key, content):
        # Consume one token; attach a leaf when the last token is reached.
        # NOTE(review): if no child matches an intermediate token the content
        # is silently dropped -- parents must be added before their children.
        k = key.pop(0)
        if key == []:
            node = Node(k, content)
            trie.append(node)
        else:
            for ch in trie:
                if ch.token() == k:
                    self._add(ch.children(), key, content)

    def add(self, key, content):
        """Insert `content` at the position named by the token list `key`."""
        self._add(self._root, key, content)

    def _graph(self, trie, G):
        # One node per entry, one edge per parent->child pair, then recurse
        # once into each subtree. Fixed: the recursive call used to sit
        # inside the child loop, re-walking the whole subtree once per
        # child (duplicate edges and repeated work).
        for l in trie:
            G.add_node(l.token())
            for ch in l.children():
                G.add_edge(l.token(), ch.token())
            self._graph(l.children(), G)

    def graph(self):
        """Build a pygraphviz digraph of the site rooted at 'sitemap'."""
        G = pgv.AGraph(directed=True)
        G.add_node("sitemap")
        for ch in self._root:
            G.add_edge("sitemap", ch.token())
        self._graph(self._root, G)
#        G.layout('dot')
#        G.draw('g.png')
#        print G.string()

    def _menu(self, trie, lang, page, css):
        # Emit one nested <ul> per trie level; only the top level gets the
        # css class, matching the original markup exactly.
        html = "<ul%s>\n" % css
        for l in trie:
            sel = ''
            # NOTE(review): page(lang) may return None for untranslated
            # pages, which would raise on .menu() below -- confirm every
            # link exists in every language.
            if l.value().page(lang) == page:
                sel = ' class="selected"'
            html += '<li%s><a href="%s">%s</a>\n' \
            % (sel,l.value().link(),l.value().page(lang).menu())
            html += self._menu(l.children(), lang, page, "")
        html += "</ul>\n"
        return html

    def menu(self, lang, page, cssclass):
        """Render the menu HTML; `page` is highlighted as selected."""
        css = ''
        if cssclass:
            css = ' class="'+cssclass+'"'
        return self._menu(self._root, lang, page, css)
287
class Sitemap():
    """Class keeping the internal site structure"""
    def __init__(self):
        self._file = 'sitemap.txt'
        self._tree = Trie()

    def add_link(self, link):
        """Tokenize `link` into path components and store it in the trie."""
        tokens = filter(None, re.split(r'(^/\w*/|\w*/)', link))
        self._tree.add(tokens, Link(link))

    def write_map(self):
        """Write one link per line to sitemap.txt."""
        f = open(self._file, 'w')
        try:
            f.write('\n'.join(link.link() for link in self._tree))
        finally:
            # Fixed: close the file even if the trie iteration raises.
            f.close()

    def read_map(self):
        """Load links from sitemap.txt; a missing file is informational only."""
        try:
            f = open(self._file)
            try:
                sml = f.read().split()
            finally:
                f.close()
            for line in sml:
                self.add_link(line)
        # Fixed: 'except IOError, e' is Python-2-only syntax and the bound
        # name was unused; a bare except IOError works on 2.6+ and 3.x.
        except IOError:
            print('INFO: Could not read sitemap.txt - one will be created')

    def set(self):
        """Return the set of all links currently in the sitemap."""
        return set(link.link() for link in self._tree)

    def graph(self):
        self._tree.graph()

    def gen_menu(self, lang, page, cssclass):
        return self._tree.menu(lang, page, cssclass)
321
def generateSitemap():
    """Scan the tree for DocBook articles, merge them into sitemap.txt,
    rewrite the file, and return the page dicts.

    Each dict carries: link, title, menu, output, exe, file, res, level.
    """
    sitemap = []
    try:
        sfile = open('sitemap.txt')
        flist = sfile.read().split()
        sfile.close()
        for f in flist:
            sitemap.append(dict(link=f))
    # Fixed: 'except IOError, e' is Python-2-only syntax and the bound
    # name was unused.
    except IOError:
        print('Sitemap missing - generating one.')

    for dirname, dirnames, filenames in os.walk('.'):
        for filename in filenames:
            if fnmatch.fnmatch(filename, '*.xml'):
                xfile = os.path.join(dirname,filename)
                # Consistency: reuse the module-level PREFIXES instead of
                # repeating the namespace dict inline.
                doc = bindery.parse(xfile, prefixes=PREFIXES)
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                code  = doc.xml_select(u"//xi:include[@parse='text']")
                resource = doc.xml_select(u"//db:link[@xl:href]")
                image = doc.xml_select(u"//db:imagedata[@fileref]")
                # Does the page embed output of an executable script?
                exe = 0
                for c in code:
                    (p, ext) = os.path.splitext(c.href)
                    if ext in valid_scripts:
                        exe = 1

                if title and menu:
                    found = 0
                    base = xfile.split('.')[1]
                    link = base.replace('index','')
                    level = len(filter(None,re.split(r'(^/\w*/|\w*/)',link)))
                    # Collect linked files/images that exist on disk.
                    res = []
                    for r in resource:
                        rf = os.path.join(dirname,r.href)
                        if os.path.isfile(rf):
                            res.append(rf)
                    for i in image:
                        im = os.path.join(dirname,i.fileref)
                        if os.path.isfile(im):
                            res.append(im)
                    # Fixed: replace only the *last* 'xml' so filenames that
                    # contain 'xml' elsewhere are not mangled (same idiom as
                    # Page.template()). Was: filename.replace('xml','html').
                    page = dict(title=unicode(doc.article.info.title),
                                menu=unicode(doc.article.info.titleabbrev),
                                output=os.path.join(dirname,
                                                    'html'.join(filename.rsplit('xml',1))),
                                exe=exe,
                                file=xfile,
                                res=res,
                                level=level)
                    for l in sitemap:
                        if l['link'] == link:
                            found = 1
                            l.update(page)
                    if not found:
                        print("adding "+link+" to sitemap")
                        dd = dict(link=link)
                        dd.update(page)
                        sitemap.append(dd)
    sfile = open('sitemap.txt','w')
    for l in sitemap:
        sfile.write(l['link']+'\n')
    sfile.close()
    return sitemap
388
def expandXincludeTxt(page):
    """Parse a page's XML and inline the stdout of executable xincludes.

    For pages flagged page['exe'], every xi:include[@parse='text'] whose
    target has a known script extension is executed; its stdout is parsed
    as XML and spliced in place of the include element.
    Returns the amara document.
    """
    doc = bindery.parse(page['file'],
                        prefixes={u'db': u'http://docbook.org/ns/docbook',
                                  u'xi': u'http://www.w3.org/2001/XInclude'})
    if page['exe']:
        code = doc.xml_select(u"//xi:include[@parse='text']")
        for c in code:
            (p, ext) = os.path.splitext(c.href)
            if ext in valid_scripts:
                exe = os.path.join(os.path.abspath(c.href))
                xml = subprocess.Popen([exe], stdout=subprocess.PIPE)
                # NOTE(review): the child process is never wait()ed on --
                # short-lived scripts make this benign, but confirm.
                xstr = bindery.parse(str(xml.stdout.read()))
                # Renamed from `id`, which shadowed the builtin.
                insert_at = c.xml_index_on_parent
                for x in xstr.xml_children:
                    c.xml_parent.xml_insert(insert_at, x)
                c.xml_parent.xml_remove(c)
    return doc
406
def xsltConvert(doc, page=None):
    """Transform an amara document to HTML with xsltproc.

    :param doc:  the (already expanded) amara document.
    :param page: page dict supplying 'file'; its directory becomes the
                 working directory for the transform. Defaults to the
                 module-global `page` for backward compatibility -- the
                 original body silently read the global loop variable.
    :returns: the rendered HTML as a string.
    """
#  amara can not handle the docbook stylesheets
#  xmlarticle = transform(doc,style_xslt)
    if page is None:
        # Fixed: make the hidden dependency on the global explicit.
        page = globals()['page']
    cwd = os.getcwd()
    rundir = os.path.dirname(page['file'])
    os.chdir(rundir)
    infile = os.path.basename(tempfile.mktemp())
    outfile = tempfile.mktemp()
    try:
        tfi = open(infile, 'w')
        tfi.write(doc.xml_encode())
        tfi.close()
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
        cmd = ["xsltproc", "--xinclude", "--output", outfile, style_xslt, infile]
        retcode = subprocess.call(cmd)
        if retcode:
            print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
        tfo = open(outfile, 'r')
        result = tfo.read()
        tfo.close()
        os.remove(infile)
        os.remove(outfile)
    finally:
        # Fixed: always restore the working directory, even on error.
        os.chdir(cwd)
    return result
430
def genMenu(page,sitemap,slevel,elevel):
    """Build a nested <ul> menu from the flat, ordered sitemap list.

    page   -- the page dict to mark selected, or None for the full menu
    sitemap -- ordered list of page dicts (with 'link', 'menu', 'level')
    slevel/elevel -- inclusive level range to render; elevel==MAXLEVEL or
    elevel==1 (or page==None) renders the whole map, otherwise only the
    sibling group around `page`.
    Returns (html, title).
    """
    title = None
    sm = []
    if elevel == MAXLEVEL or elevel == 1 or page == None:
        html = '<ul>\n'
        sm = sitemap
    else:
        # Sibling-only menu: walk backwards to the nearest entry at a
        # different (parent) level, take its menu text as the title, then
        # collect the contiguous run of same-level entries that follows.
        # NOTE(review): if every preceding entry shares page's level this
        # walks past index 0 into negative indices -- confirm the sitemap
        # always starts with a lower-level entry.
        html = '<ul class="tree">\n'
        idx = sitemap.index(page)
        while (sitemap[idx]['level'] == page['level']):
            idx = idx-1
        title = sitemap[idx]['menu']
        idx = idx+1
        while (idx < len(sitemap) and sitemap[idx]['level'] == page['level']):
            sm.append(sitemap[idx])
            idx = idx+1
    oldlevel = slevel

    for p in sm:
        # Skip entries outside the requested level window.
        if slevel > p['level'] or elevel < p['level']:
            continue
        # The root page supplies the default title.
        if not title and p['link'] == '/':
            title = p['menu']

        # Open a sub-list on descent, close list+item on ascent. A link
        # ending in '/' is a directory whose <li> stays open for children.
        if oldlevel < p['level']:
            html+='<ul>\n'
        elif oldlevel > p['level']:
            if p['link'][-1] == '/':
                html+='</li>\n'
            html+='</ul>\n</li>\n'
        if page != None and page == p:
            html+='<li class="selected"><a href="%s">%s</a>' % (p['link'],p['menu'])
        else:
            html+='<li><a href="%s">%s</a>' % (p['link'],p['menu'])
        # Leaf pages (and the root '/') close their <li> immediately.
        if p['link'][-1] != '/' or p['link'] == '/':
            html+='</li>\n'
        oldlevel = p['level']
    html+='</ul>\n'
    return (html,title)
470
def writeToTemplate(page, doc, sitemap):
    """Render one page through the Cheetah template into tmptarget and
    copy its resources (linked files/images) alongside the HTML."""
    menu, menuname = genMenu(page, sitemap, 1, MAXLEVEL)
    levelmenu, levelname = genMenu(page, sitemap, page['level'], page['level'])
    context = [{'title': page['title']},
               {'menu': menu},
               {'article': doc},
               {'levelmenu': levelmenu},
               {'levelname': levelname}]
    template = Template(file=style_tmpl, searchList=context)
    outfile = tmptarget + page['output']
    mkdir_p(os.path.dirname(outfile))
    out = open(outfile, 'w')
    out.write(str(template))
    out.close()
    for res in page['res']:
        mkdir_p(os.path.dirname(tmptarget + res))
        shutil.copyfile(res, tmptarget + res)
488
def createSitemap(sitemap):
    """Render the sitemap overview page; the full menu doubles as body."""
    menu, _ = genMenu(None, sitemap, 1, MAXLEVEL)
    context = [{'title': 'Sitemap'},
               {'menu': menu},
               {'article': menu},
               {'levelmenu': ''},
               {'levelname': ''}]
    template = Template(file=style_tmpl, searchList=context)
    outfile = tmptarget + 'sitemap.en.html'
    mkdir_p(os.path.dirname(outfile))
    out = open(outfile, 'w')
    out.write(str(template))
    out.close()
503
# --- Reconcile on-disk articles with the stored sitemap.txt ---
dir_ = Directory()
sitemap = Sitemap()

dir_.scan()
sitemap.read_map()

missing = dir_.set() - sitemap.set()   # on disk but not in the sitemap
removed = sitemap.set() - dir_.set()   # in the sitemap but gone from disk
for page in removed:
    # Fixed: the original did `print removed+'...'`, concatenating the whole
    # set with a string (TypeError); report the individual page instead.
    print(page+' pages missing!!')
for page in missing:
    print('adding missing page '+page)
    sitemap.add_link(page)
# Fixed: the original tested `missing & removed`, but those two set
# differences are disjoint by construction, so the intersection is always
# empty and the sitemap was never rewritten. Use the union.
if len(missing | removed) != 0:
    print('writing new sitemap - please adjust if needed')
    sitemap.write_map()
sitemap.graph()
521
522
# --- Build every page into the staging area, then publish ---
# NOTE(review): this rebinds both `sitemap` (list of dicts, replacing the
# Sitemap object above) and `tmptarget` (leaking the mkdtemp() created at
# module top) -- confirm the first staging dir is intentionally abandoned.
sitemap = generateSitemap()
tmptarget = tempfile.mkdtemp()+'/'
for page in sitemap:
    t1 = time.time()
    # Trailing comma: keep the timing figure on the same output line.
    print "Page : %-30s %30s" % (page['link'],
                        time.ctime(os.stat(page['file']).st_mtime)),
    doc = expandXincludeTxt(page)
    # xsltConvert also reads the global `page` set by this loop.
    pubdoc = xsltConvert(doc)
    writeToTemplate(page,pubdoc,sitemap)
    t2 = time.time()
    print "[%5.2f s]" % (round(t2-t1,2))

createSitemap(sitemap)
# rsync the staged site plus the style assets into the output directory.
publish(tmptarget, args.output)
publish(args.style+"css", args.output)
publish(args.style+"images",args.output)