[treecutter.git] / src / tree-cutter.py
#!/usr/bin/python
import os
import fnmatch
import subprocess
import amara
import re
import tempfile
import errno
import time
import argparse
import shutil
import pygraphviz as pgv
import glob
from amara import bindery
from amara.xslt import transform
from Cheetah.Template import Template

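# Example invocation (paths are illustrative only; by default the style and
# output directories are taken from <parent of cwd>/style/default/ and
# <parent of cwd>/htdocs/):
#   cd <article tree> && tree-cutter.py --style ../style/default/ --output ../htdocs/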
parser = argparse.ArgumentParser(description='Process docbook article tree.')
parser.add_argument('--style', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/style/default/')
parser.add_argument('--output', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/htdocs/')
args = parser.parse_args()

style_xslt = args.style+"docbook.xsl"
style_tmpl = args.style+"index.en.html.tmpl"
outputdir = args.output

tmptarget = tempfile.mkdtemp()+'/'

valid_scripts = ['.py','.pl']
MAXLEVEL = 10000

def mkdir_p(path):
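    """Create a directory including missing parents, like `mkdir -p`;
    an already existing directory is not treated as an error."""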
    try:
        os.makedirs(path)
    except OSError as exc: # Python >2.5
        if exc.errno == errno.EEXIST:
            pass
        else: raise

def publish(src,target):
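    """Mirror src into target with `rsync -a --delete` and report the
    command line if rsync returns a non-zero exit code."""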
    cmd = ["rsync","-a","--delete",src,target]
    retcode = subprocess.call(cmd)
    if retcode:
        print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'


PREFIXES={u'db': u'http://docbook.org/ns/docbook',
          u'xi': u'http://www.w3.org/2001/XInclude',
          u'xl': u'http://www.w3.org/1999/xlink'}

class Directory():
    """Class containing the state of the directory with articles"""
    def __init__(self):
        self._cwd = '.'
        self._tree = []

    def scan(self):
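        """Walk the article tree for *.xml files and record a link (the
        path with the trailing 'index' removed) for every DocBook article
        that has both a title and a titleabbrev."""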
        for dirname, dirnames, filenames in os.walk(self._cwd):
            for filename in filenames:
                if fnmatch.fnmatch(filename, '*.xml'):
                    file_ = os.path.join(dirname,filename)
                    doc = bindery.parse(file_, prefixes=PREFIXES)
                    title = doc.xml_select(u'/db:article/db:info/db:title')
                    menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                    if title and menu:
                        base = file_.split('.')[1]
                        link = base.replace('index','')
                        self._tree.append(link)

    def set(self):
        return set(self._tree)

class Page():
    """Class representing a version of a webpage"""
    def __init__(self,page):
        self._file = page[1]
        self._lang = page[0]
        self._doc = None
        self._resources = []
        self._title = None
        self._menu = None
        self._rendered_article = None

    def language(self):
        return self._lang

    def menu(self):
        return self._menu

    def set_article(self,art):
        self._rendered_article = art

    def prepare(self):
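        """Parse the page's XML, record its title and menu entry, run any
        xi:include scripts with parse='text' and splice their output into
        the document, and collect linked files and images that exist on
        disk as resources."""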
        self._doc = bindery.parse(self._file, prefixes=PREFIXES)
        if self._doc.xml_select(u'/db:article/db:info/db:title'):
            self._title = unicode(self._doc.article.info.title)
        if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
            self._menu = unicode(self._doc.article.info.titleabbrev)

        dirname = os.path.dirname(self._file)
        code  = self._doc.xml_select(u"//xi:include[@parse='text']")
        if code:
            for c in code:
                (p, ext) = os.path.splitext(c.href)
                if ext in valid_scripts:
                    exe = os.path.abspath(os.path.join(dirname,c.href))
                    xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
                    xstr = bindery.parse(str(xml.stdout.read()))
                    idp = c.xml_index_on_parent
                    # splice the script output into the document in place of the include
                    for x in xstr.xml_children:
                        c.xml_parent.xml_insert(idp,x)
                    c.xml_parent.xml_remove(c)

        for r in self._doc.xml_select(u"//db:link[@xl:href]"):
            rf = os.path.join(dirname,r.href)
            if os.path.isfile(rf):
                self._resources.append(rf)
        for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
            im = os.path.join(dirname,i.fileref)
            if os.path.isfile(im):
                self._resources.append(im)

    def render(self):
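        """Write the prepared document to a temporary file in the
        article's directory and transform it to HTML with xsltproc."""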
        #  amara can not handle the docbook stylesheets
        #  xmlarticle = transform(doc,style_xslt)
        cwd = os.getcwd()
        dirname = os.path.dirname(self._file)
        os.chdir(dirname)
        infile  = os.path.basename(tempfile.mktemp())
        outfile = tempfile.mktemp()
        tfi = open(infile,'w')
        tfi.write(self._doc.xml_encode())
        tfi.close()
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
        cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
        retcode = subprocess.call(cmd)
        if retcode:
            print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
        tfo = open(outfile,'r')
        self._rendered_article = tfo.read()
        tfo.close()
        os.remove(infile)
        os.remove(outfile)
        os.chdir(cwd)

    def template(self,sitemap):
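        """Fill the Cheetah template with the rendered article and the
        menus from the sitemap and write the result below the temporary
        target directory."""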
        htmlmenu  = sitemap.gen_menu(self._lang,None)
        levelmenu = sitemap.gen_menu(self._lang,self)
        levelname = self._menu   # gen_menu does not return a title yet; use this page's own menu entry
        template = Template(file=style_tmpl,
                            searchList=[{'title':self._title},
                                        {'menu':htmlmenu},
                                        {'article':self._rendered_article},
                                        {'levelmenu':levelmenu},
                                        {'levelname':levelname}])
        outfile = tmptarget+self._file+'.'+self._lang+'.html'
        mkdir_p(os.path.dirname(outfile))
        out = open(outfile, 'w')
        out.write(str(template))
        out.close()


class Link():
    """Class representing a webpage on the site"""
    def __init__(self,link):
        self._link = link
        # find the representations of the link.
        self._pages = []
        path = link
        if self._link[-1] == '/':
            path = path+'index'
        lang = self._scan_languages(path)
        for l in lang:
            self._pages.append(Page(l))

    def _scan_languages(self,path):
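        """Return (language, filename) tuples for every translation of
        `path`, assuming source files are named like ./<path>.<lang>.xml."""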
        lang = []
        for l in glob.glob('.'+path+'*'):
            ls = l.split('.')
            if len(ls) > 3 and ls[3] == 'xml':
                lang.append((ls[2],l))
        return lang

    def link(self):
        return self._link

    def prepare(self):
        for page in self._pages:
            page.prepare()

    def languages(self):
        p = []
        for page in self._pages:
            p.append(page.language())
        return p

    def render(self):
        for page in self._pages:
            page.render()

    def template(self,sitemap):
        for page in self._pages:
            page.template(sitemap)

    def page(self,lang):
        for page in self._pages:
            if page.language()==lang:
                return page

class Node():
    def __init__(self,token,value):
        self._token = token
        self._value = value
        self._children = []

    def token(self):
        return self._token

    def value(self):
        return self._value

    def children(self):
        return self._children

class Trie():
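    """Prefix tree describing the site hierarchy: each Node holds a path
    token and a content value (a Link when used by Sitemap); iterating the
    trie yields the values depth-first, each node before its children."""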
    def __init__(self):
        self._root = []

    def __iter__(self):
        return self.inorder(self._root)

    def inorder(self,t):
        for l in t:
            yield l.value()
            for x in self.inorder(l.children()):
                yield x

    def _add(self,trie, key, content):
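        """Walk down the trie along the key tokens and append a new Node
        for the last token; intermediate tokens must already exist, so
        parent links have to be added before their children."""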
        # is the key a leaf
        k = key.pop(0)
        if key == []:
            node = Node(k,content)
            trie.append(node)
        else:
            for ch in trie:
                if ch.token() == k:
                    self._add(ch.children(), key, content)

    def add(self,key, content):
        self._add(self._root, key, content)

    def _graph(self, trie, G):
        for l in trie:
            G.add_node(l.token())
            for ch in l.children():
                G.add_edge(l.token(),ch.token())
            self._graph(l.children(), G)

    def graph(self):
        G = pgv.AGraph(directed=True)
        G.add_node("sitemap")
        for ch in self._root:
            G.add_edge("sitemap",ch.token())
        self._graph(self._root, G)
#        G.layout('dot')
#        G.draw('g.png')
#        print G.string()

class Sitemap():
    """Class keeping the internal site structure"""
    def __init__(self):
        self._file = 'sitemap.txt'
        self._tree = Trie()

    def add_link(self, link):
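        """Split the link into its path tokens (e.g. '/foo/bar/' becomes
        ['/foo/', 'bar/']) and store a Link for it in the trie."""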
        tokens = filter(None,re.split(r'(^/\w*/|\w*/)',link))
        self._tree.add(tokens,Link(link))

    def read_map(self):
        try:
            f = open(self._file)
            sml = f.read().split()
            f.close()
            for line in sml:
                self.add_link(line)
        except IOError, what_error:
            print 'INFO: Could not read sitemap.txt - one will be created'

    def set(self):
        return set(link.link() for link in self._tree)

    def graph(self):
        self._tree.graph()

    def gen_menu(self,lang,page):
        return 'Generate menu from sitemap - To be implemented'

def generateSitemap():
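    """Scan the tree for DocBook articles, merge them with any existing
    sitemap.txt, collect per-page metadata (title, menu entry, output
    name, resources, level, whether it contains executable includes),
    rewrite sitemap.txt and return the page list."""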
    sitemap = []
    try:
        sfile = open('sitemap.txt')
        flist = sfile.read().split()
        sfile.close()
        for f in flist:
            sitemap.append(dict(link=f))
    except IOError, what_error:
        print 'Sitemap missing - generating one.'

    for dirname, dirnames, filenames in os.walk('.'):
        for filename in filenames:
            if fnmatch.fnmatch(filename, '*.xml'):
                xfile = os.path.join(dirname,filename)
                doc = bindery.parse(xfile,
                                    prefixes={u'db': u'http://docbook.org/ns/docbook',
                                              u'xi': u'http://www.w3.org/2001/XInclude',
                                              u'xl': u'http://www.w3.org/1999/xlink'})
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                code  = doc.xml_select(u"//xi:include[@parse='text']")
                resource = doc.xml_select(u"//db:link[@xl:href]")
                image = doc.xml_select(u"//db:imagedata[@fileref]")
                exe = 0
                for c in code:
                    (p, ext) = os.path.splitext(c.href)
                    if ext in valid_scripts:
                        exe = 1

                if title and menu:
                    found = 0
                    base = xfile.split('.')[1]
                    link = base.replace('index','')
                    level = len(filter(None,re.split(r'(^/\w*/|\w*/)',link)))
                    res = []
                    for r in resource:
                        rf = os.path.join(dirname,r.href)
                        if os.path.isfile(rf):
                            res.append(rf)
                    for i in image:
                        im = os.path.join(dirname,i.fileref)
                        if os.path.isfile(im):
                            res.append(im)
                    page = dict(title=unicode(doc.article.info.title),
                                menu=unicode(doc.article.info.titleabbrev),
                                output=os.path.join(dirname,
                                                    filename.replace('xml','html')),
                                exe=exe,
                                file=xfile,
                                res=res,
                                level=level)
                    for l in sitemap:
                        if l['link'] == link:
                            found = 1
                            l.update(page)
                    if not found:
                        print "adding "+link+" to sitemap"
                        dd = dict(link=link)
                        dd.update(page)
                        sitemap.append(dd)
    sfile = open('sitemap.txt','w')
    for l in sitemap:
        sfile.write(l['link']+'\n')
    sfile.close()
    return sitemap

def expandXincludeTxt(page):
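    """Parse the page's XML source and, when the page is flagged as
    executable, replace every xi:include with parse='text' whose target
    is a known script type by the XML that the script writes to stdout."""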
    doc = bindery.parse(page['file'],
                        prefixes={u'db': u'http://docbook.org/ns/docbook',
                                  u'xi': u'http://www.w3.org/2001/XInclude'})
    if page['exe']:
        code  = doc.xml_select(u"//xi:include[@parse='text']")
        for c in code:
            (p, ext) = os.path.splitext(c.href)
            if ext in valid_scripts:
                exe = os.path.abspath(os.path.join(os.path.dirname(page['file']),c.href))
                xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
                xstr = bindery.parse(str(xml.stdout.read()))
                id = c.xml_index_on_parent
                for x in xstr.xml_children:
                    c.xml_parent.xml_insert(id,x)
                c.xml_parent.xml_remove(c)
    return doc

def xsltConvert(doc, page):
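    """Apply the DocBook XSL stylesheet to the expanded document with
    xsltproc, working in the page's directory, and return the generated
    HTML as a string."""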
#  amara can not handle the docbook stylesheets
#  xmlarticle = transform(doc,style_xslt)
    cwd = os.getcwd()
    rundir = os.path.dirname(page['file'])
    os.chdir(rundir)
    infile  = os.path.basename(tempfile.mktemp())
    outfile = tempfile.mktemp()
    tfi = open(infile,'w')
    tfi.write(doc.xml_encode())
    tfi.close()
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
    cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
    retcode = subprocess.call(cmd)
    if retcode:
        print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
    tfo = open(outfile,'r')
    result = tfo.read()
    tfo.close()
    os.remove(infile)
    os.remove(outfile)
    os.chdir(cwd)
    return result

def genMenu(page,sitemap,slevel,elevel):
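    """Build a nested <ul> menu from the sitemap for levels slevel..elevel.
    With elevel == MAXLEVEL (or 1, or no current page) the whole sitemap is
    used; otherwise only the entries sharing the current page's level.
    Returns (html, title), where title is the menu name of the entry just
    above the current block."""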
    title = None
    sm = []
    if elevel == MAXLEVEL or elevel == 1 or page == None:
        html = '<ul>\n'
        sm = sitemap
    else:
        html = '<ul class="tree">\n'
        idx = sitemap.index(page)
        while (sitemap[idx]['level'] == page['level']):
            idx = idx-1
        title = sitemap[idx]['menu']
        idx = idx+1
        while (idx < len(sitemap) and sitemap[idx]['level'] == page['level']):
            sm.append(sitemap[idx])
            idx = idx+1
    oldlevel = slevel

    for p in sm:
        if slevel > p['level'] or elevel < p['level']:
            continue
        if not title and p['link'] == '/':
            title = p['menu']

        if oldlevel < p['level']:
            html+='<ul>\n'
        elif oldlevel > p['level']:
            if p['link'][-1] == '/':
                html+='</li>\n'
            html+='</ul>\n</li>\n'
        if page != None and page == p:
            html+='<li class="selected"><a href="%s">%s</a>' % (p['link'],p['menu'])
        else:
            html+='<li><a href="%s">%s</a>' % (p['link'],p['menu'])
        if p['link'][-1] != '/' or p['link'] == '/':
            html+='</li>\n'
        oldlevel = p['level']
    html+='</ul>\n'
    return (html,title)

def writeToTemplate(page,doc,sitemap):
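    """Render one page: fill the Cheetah template with the full site menu,
    the current level's menu and the transformed article, write the page
    under the temporary target and copy its resources alongside it."""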
    (menu,menuname) = genMenu(page,sitemap,1,MAXLEVEL)
    (levelmenu,levelname) = genMenu(page,sitemap,page['level'],page['level'])
    template = Template(file=style_tmpl,
                        searchList=[{'title':page['title']},
                                    {'menu':menu},
                                    {'article':doc},
                                    {'levelmenu':levelmenu},
                                    {'levelname':levelname}])
    outfile = tmptarget+page['output']
    mkdir_p(os.path.dirname(outfile))
    out = open(outfile, 'w')
    out.write(str(template))
    out.close()
    for r in page['res']:
        mkdir_p(os.path.dirname(tmptarget+r))
        shutil.copyfile(r, tmptarget+r)

def createSitemap(sitemap):
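    """Render the full menu through the normal page template, serving as
    both menu and article body, and write it to sitemap.en.html under the
    temporary target."""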
    (menu,menuname) = genMenu(None,sitemap,1,MAXLEVEL)
    template = Template(file=style_tmpl,
                        searchList=[{'title':'Sitemap'},
                                    {'menu':menu},
                                    {'article':menu},
                                    {'levelmenu':''},
                                    {'levelname':''}])
    outfile = tmptarget+'sitemap.en.html'
    mkdir_p(os.path.dirname(outfile))
    out = open(outfile, 'w')
    out.write(str(template))
    out.close()

dir_ = Directory()
sitemap = Sitemap()

dir_.scan()
sitemap.read_map()

missing = dir_.set() - sitemap.set()
removed = sitemap.set() - dir_.set()
for page in removed:
    print page+' page missing!!'

for page in missing:
    print 'adding missing page '+page
    sitemap.add_link(page)

sitemap.graph()


sitemap = generateSitemap()
tmptarget = tempfile.mkdtemp()+'/'
for page in sitemap:
    t1 = time.time()
    print "Page : %-30s %30s" % (page['link'],
                        time.ctime(os.stat(page['file']).st_mtime)),
    doc = expandXincludeTxt(page)
    pubdoc = xsltConvert(doc, page)
    writeToTemplate(page,pubdoc,sitemap)
    t2 = time.time()
    print "[%5.2f s]" % (round(t2-t1,2))

createSitemap(sitemap)
publish(tmptarget, args.output)
publish(args.style+"css", args.output)
publish(args.style+"images",args.output)