Initial layout of the work the Page object has to do, based on the non-object version.
[treecutter.git] / src / tree-cutter.py
1 #!/usr/bin/python
2 import os
3 import fnmatch
4 import subprocess
5 import amara
6 import re
7 import tempfile
8 import errno
9 import time
10 import argparse
11 import shutil
12 import pygraphviz as pgv
13 import glob
14 from amara import bindery
15 from amara.xslt import transform
16 from Cheetah.Template import Template
17
# Command-line interface: the style directory (XSLT + Cheetah templates) and
# the publish target default to sibling directories of the current tree.
parser = argparse.ArgumentParser(description='Process docbook article tree.')
parser.add_argument('--style', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/style/default/')
parser.add_argument('--output', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/htdocs/')
args = parser.parse_args()

# Derived paths: the DocBook stylesheet and the page template inside --style.
style_xslt = args.style+"docbook.xsl"
style_tmpl = args.style+"index.en.html.tmpl"
outputdir = args.output

# File extensions of xi:include'd scripts that may be executed to produce XML.
valid_scripts = ['.py','.pl']
# Sentinel menu depth meaning "render the whole tree" (see genMenu).
MAXLEVEL = 10000
31
def mkdir_p(path):
    """Create *path* and any missing parents, like ``mkdir -p``.

    A no-op when the directory already exists; raises OSError for any
    other failure (permissions, or *path* existing as a regular file).
    """
    try:
        os.makedirs(path)
    except OSError as exc: # Python >2.5
        # EEXIST is only benign when the existing entry really is a
        # directory; a same-named regular file would break later writes.
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
39
def publish(src,target):
    """Mirror *src* into *target* with rsync, deleting stale files."""
    command = ["rsync", "-a", "--delete", src, target]
    status = subprocess.call(command)
    if status:
        print('Error: ' + ' '.join(command) + ' Returncode [' + str(status) + ']')
45
46
# XML namespace prefixes used by every amara/bindery XPath query below:
# db = DocBook 5, xi = XInclude, xl = XLink.
PREFIXES={u'db': u'http://docbook.org/ns/docbook',
          u'xi': u'http://www.w3.org/2001/XInclude',
          u'xl': u'http://www.w3.org/1999/xlink'}
50
class Directory():
    """Tracks the article XML files found beneath the current directory."""
    def __init__(self):
        self._cwd = '.'
        self._tree = []

    def scan(self):
        """Walk the tree and record the site link of every valid article."""
        for dirpath, subdirs, files in os.walk(self._cwd):
            for name in files:
                if not fnmatch.fnmatch(name, '*.xml'):
                    continue
                path = os.path.join(dirpath, name)
                document = bindery.parse(path, prefixes=PREFIXES)
                has_title = document.xml_select(u'/db:article/db:info/db:title')
                has_menu = document.xml_select(u'/db:article/db:info/db:titleabbrev')
                if has_title and has_menu:
                    # './name.lang.xml' -> '/name', index pages map to '/'
                    stem = path.split('.')[1]
                    self._tree.append(stem.replace('index', ''))

    def set(self):
        """Return the scanned links as a set (used for sitemap diffing)."""
        return set(self._tree)
72
class Page():
    """Class representing one language version of a webpage."""
    def __init__(self,page):
        # page is a (lang, filename) tuple as produced by Link._scan_languages
        self._file = page[1]
        self._lang = page[0]
        self._doc = None
        self._resources = []  # on-disk files (links/images) published with the page
        self._title = None
        self._menu = None
        self._rendered_article = None

    def prepare(self):
        """Parse the article, run executable text includes, collect resources."""
        self._doc = bindery.parse(self._file, prefixes=PREFIXES)
        if self._doc.xml_select(u'/db:article/db:info/db:title'):
            # was: unicode(doc...) -- 'doc' is undefined in this scope
            self._title = unicode(self._doc.article.info.title)
        if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
            self._menu = unicode(self._doc.article.info.titleabbrev)

        dirname = os.path.dirname(self._file)
        code  = self._doc.xml_select(u"//xi:include[@parse='text']")
        if code:
            for c in code:
                (p, ext) = os.path.splitext(c.href)
                if ext in valid_scripts:
                    # join the article directory and the href properly
                    # (plain string concatenation dropped the separator)
                    exe = os.path.abspath(os.path.join(dirname, c.href))
                    xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
                    xstr = bindery.parse(str(xml.stdout.read()))
                    idp = c.xml_index_on_parent
                    for x in xstr.xml_children:
                        c.xml_parent.xml_insert(idp,x)
                    # remove the include only after all children are inserted
                    # (matches expandXincludeTxt; removing inside the loop
                    # invalidated the insert position)
                    c.xml_parent.xml_remove(c)

        # Remember linked files and images that exist on disk so they can
        # be copied next to the rendered page.
        for r in self._doc.xml_select(u"//db:link[@xl:href]"):
            rf = os.path.join(dirname,r.href)
            if os.path.isfile(rf):
                self._resources.append(rf)
        for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
            im = os.path.join(dirname,i.fileref)
            if os.path.isfile(im):
                self._resources.append(im)

    def render(self):
        """Transform the prepared document to HTML with xsltproc."""
        #  amara can not handle the docbook stylesheets
        #  xmlarticle = transform(doc,style_xslt)
        cwd = os.getcwd()
        dirname = os.path.dirname(self._file)
        os.chdir(dirname)
        infile  = os.path.basename(tempfile.mktemp())
        outfile = tempfile.mktemp()
        tfi = open(infile,'w')
        # was: doc.xml_encode() -- 'doc' is undefined in this scope
        tfi.write(self._doc.xml_encode())
        tfi.close()
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
        cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
        retcode = subprocess.call(cmd)
        if retcode:
            print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
        tfo = open(outfile,'r')
        self._rendered_article = tfo.read()
        tfo.close()
        os.remove(infile)
        os.remove(outfile)
        os.chdir(cwd)

    def template(self,sitemap):
        """Fill the Cheetah template and write the final HTML file."""
        htmlmenu =  sitemap.gen_menu(self._lang,None)
        levelmenu = sitemap.gen_menu(self._lang,self)
        # NOTE(review): 'levelname' was an undefined name here; the page's
        # own menu title is used instead -- confirm against the non-object
        # version, where it came from genMenu()'s second return value.
        levelname = self._menu
        template = Template(file=style_tmpl,
                            searchList=[{'title':self._title},
                                        {'menu':htmlmenu},
                                        {'article':self._rendered_article},
                                        {'levelmenu':levelmenu},
                                        {'levelname':levelname}])
        outfile = tmptarget+self._file+'.'+self._lang+'.html'
        mkdir_p(os.path.dirname(outfile))
        out = open(outfile, 'w')
        out.write(str(template))
        out.close()
151
152
class Link():
    """A webpage on the site, with one Page object per available language."""
    def __init__(self,link):
        self._link = link
        self._pages = []
        # Directory links ('/foo/') are represented by their index document.
        path = link + 'index' if link[-1] == '/' else link
        for found in self._scan_languages(path):
            self._pages.append(Page(found))

    def _scan_languages(self,path):
        """Return (language, filename) tuples for files matching '.<path>.<lang>.xml'."""
        found = []
        for candidate in glob.glob('.' + path + '*'):
            parts = candidate.split('.')
            if len(parts) > 3 and parts[3] == 'xml':
                found.append((parts[2], candidate))
        return found

    def link(self):
        """The site-absolute link this object represents."""
        return self._link
176
class Node():
    """One element of the sitemap trie: a path token, a payload and children."""

    def __init__(self,token,value):
        self._token = token
        self._value = value
        self._children = []

    def token(self):
        """The path component this node represents."""
        return self._token

    def value(self):
        """The payload (a Link) stored at this node."""
        return self._value

    def children(self):
        """The mutable list of child nodes."""
        return self._children
191
class Trie():
    """Prefix tree of path tokens modelling the site hierarchy."""
    def __init__(self):
        self._root = []

    def _add(self,trie, key, content):
        """Recursively insert *content* at token path *key* (consumed in place)."""
        # is the key a leaf
        k = key.pop(0)
        if key == []:
            node = Node(k,content)
            trie.append(node)
        else:
            # NOTE(review): if no child matches k the content is silently
            # dropped -- parents must be added before their children.
            for ch in trie:
                if ch.token() == k:
                    self._add(ch.children(), key, content)

    def add(self,key, content):
        """Insert *content* under *key*, a list of path components."""
        self._add(self._root, key, content)

    def _graph(self, trie, G):
        """Add nodes and edges for every entry of *trie* to pygraphviz graph G."""
        for l in trie:
            G.add_node(l.token())
            for ch in l.children():
                G.add_edge(l.token(),ch.token())
            # recurse once per node -- previously this call sat inside the
            # child loop, so every subtree was re-visited once per child
            self._graph(l.children(), G)

    def graph(self):
        """Print a graphviz rendering of the site tree rooted at 'sitemap'."""
        G = pgv.AGraph(directed=True)
        G.add_node("sitemap")
        for ch in self._root:
            G.add_edge("sitemap",ch.token())
        self._graph(self._root, G)
#        G.layout('dot')
#        G.draw('g.png')
        print(G.string())
226
class Sitemap():
    """Class keeping the internal site structure"""
    def __init__(self):
        self._file = 'sitemap.txt'
        self._tree = Trie()

    def add_link(self, link):
        """Split *link* into path tokens and store a Link object in the trie."""
        # list() so the tokens support pop() (Trie._add consumes them)
        # under Python 3's lazy filter as well
        tokens = list(filter(None,re.split(r'(^/\w*/|\w*/)',link)))
        self._tree.add(tokens,Link(link))

    def read_map(self):
        """Load links from sitemap.txt, one per whitespace-separated entry."""
        try:
            f = open(self._file)
            sml = f.read().split()
            f.close()
            for line in sml:
                self.add_link(line)
        except IOError:
            print('INFO: Could not read sitemap.txt - one will be created')

    def _links(self, nodes):
        """Collect the Link payloads of *nodes* and all their descendants."""
        found = []
        for node in nodes:
            found.append(node.value())
            found.extend(self._links(node.children()))
        return found

    def set(self):
        """Return the set of link strings stored in the sitemap."""
        # was: 'for link in self._tree' -- Trie is not iterable (TypeError);
        # walk the trie through its node API instead
        return set(link.link() for link in self._links(self._tree._root))

    def graph(self):
        """Print the site structure as a graphviz graph."""
        self._tree.graph()
252
def generateSitemap():
    """Scan the tree for DocBook articles, merge them with sitemap.txt,
    rewrite that file, and return the sitemap as a list of page dicts."""
    sitemap = []
    # Seed with previously recorded links so their ordering is preserved.
    try:
        sfile = open('sitemap.txt')
        flist = sfile.read().split()
        sfile.close()
        for f in flist:
            sitemap.append(dict(link=f))
    except IOError, what_error:
        print 'Sitemap missing - generating one.'

    for dirname, dirnames, filenames in os.walk('.'):
        for filename in filenames:
            if fnmatch.fnmatch(filename, '*.xml'):
                xfile = os.path.join(dirname,filename)
                doc = bindery.parse(xfile,
                                    prefixes={u'db': u'http://docbook.org/ns/docbook',
                                              u'xi': u'http://www.w3.org/2001/XInclude',
                                              u'xl': u'http://www.w3.org/1999/xlink'})
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                code  = doc.xml_select(u"//xi:include[@parse='text']")
                resource = doc.xml_select(u"//db:link[@xl:href]")
                image = doc.xml_select(u"//db:imagedata[@fileref]")
                # exe flags pages embedding executable script includes
                exe = 0
                for c in code:
                    (p, ext) = os.path.splitext(c.href)
                    if ext in valid_scripts:
                        exe = 1

                # Only articles carrying both a title and a menu name count.
                if title and menu:
                    found = 0
                    # './name.lang.xml' -> site link; 'index' maps to '/'
                    base = xfile.split('.')[1]
                    link = base.replace('index','')
                    # depth of the link within the site tree
                    level = len(filter(None,re.split(r'(^/\w*/|\w*/)',link)))
                    res = []
                    # Collect referenced files and images that exist on disk.
                    for r in resource:
                        rf = os.path.join(dirname,r.href)
                        if os.path.isfile(rf):
                            res.append(rf)
                    for i in image:
                        im = os.path.join(dirname,i.fileref)
                        if os.path.isfile(im):
                            res.append(im)
                    page = dict(title=unicode(doc.article.info.title),
                                menu=unicode(doc.article.info.titleabbrev),
                                output=os.path.join(dirname,
                                                    filename.replace('xml','html')),
                                exe=exe,
                                file=xfile,
                                res=res,
                                level=level)
                    # Update an existing entry in place or append a new one.
                    for l in sitemap:
                        if l['link'] == link:
                            found = 1
                            l.update(page)
                    if not found:
                        print "adding "+link+" to sitemap"
                        dd = dict(link=link)
                        dd.update(page)
                        sitemap.append(dd)
    # Persist the (possibly extended) link list for the next run.
    sfile = open('sitemap.txt','w')
    for l in sitemap:
        sfile.write(l['link']+'\n')
    sfile.close()
    return sitemap
319
def expandXincludeTxt(page):
    """Parse the page's article and run every xi:include parse='text' script,
    splicing the generated XML into the document in place of the include."""
    doc = bindery.parse(page['file'],
                        prefixes={u'db': u'http://docbook.org/ns/docbook',
                                  u'xi': u'http://www.w3.org/2001/XInclude'})
    if page['exe']:
        for include in doc.xml_select(u"//xi:include[@parse='text']"):
            (stem, ext) = os.path.splitext(include.href)
            if ext not in valid_scripts:
                continue
            script = os.path.join(os.path.abspath(include.href))
            proc = subprocess.Popen([script],stdout=subprocess.PIPE)
            generated = bindery.parse(str(proc.stdout.read()))
            at = include.xml_index_on_parent
            for child in generated.xml_children:
                include.xml_parent.xml_insert(at,child)
            include.xml_parent.xml_remove(include)
    return doc
337
def xsltConvert(doc, page=None):
    """Run the DocBook XSLT over *doc* with xsltproc and return the HTML.

    page: optional sitemap entry dict with a 'file' key.  Defaults to the
    module-level `page` (the entry currently processed by the main loop) so
    existing single-argument callers keep working; previously the function
    silently depended on that global.
    """
#  amara can not handle the docbook stylesheets
#  xmlarticle = transform(doc,style_xslt)
    if page is None:
        # historical behaviour: pick up the loop variable from module scope
        page = globals()['page']
    cwd = os.getcwd()
    # work inside the article's directory so relative references resolve
    rundir = os.path.dirname(page['file'])
    os.chdir(rundir)
    infile  = os.path.basename(tempfile.mktemp())
    outfile = tempfile.mktemp()
    tfi = open(infile,'w')
    tfi.write(doc.xml_encode())
    tfi.close()
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
    cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
    retcode = subprocess.call(cmd)
    if retcode:
        print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
    tfo = open(outfile,'r')
    result = tfo.read()
    tfo.close()
    os.remove(infile)
    os.remove(outfile)
    os.chdir(cwd)
    return result
361
def genMenu(page,sitemap,slevel,elevel):
    """Build a nested <ul> HTML menu from *sitemap*, a list of page dicts.

    page    -- current page dict, or None for the site-wide menu
    slevel  -- lowest tree level to include
    elevel  -- highest tree level to include (MAXLEVEL = whole tree)
    Returns (html, title): the markup and the submenu's display name.
    """
    title = None
    sm = []
    if page == None or elevel == 1 or elevel == MAXLEVEL:
        # Full menu over the whole sitemap.
        html = '<ul>\n'
        sm = sitemap
    else:
        # Submenu: the run of entries sharing the page's level, titled by
        # the nearest enclosing entry just above that run.
        html = '<ul class="tree">\n'
        idx = sitemap.index(page)
        # guard idx >= 0 so a run starting at the head of the list cannot
        # walk off the front (negative indices wrap in Python)
        while idx >= 0 and sitemap[idx]['level'] == page['level']:
            idx = idx-1
        title = sitemap[idx]['menu'] if idx >= 0 else None
        idx = idx+1
        while idx < len(sitemap) and sitemap[idx]['level'] == page['level']:
            sm.append(sitemap[idx])
            idx = idx+1
    oldlevel = slevel

    for p in sm:
        if slevel > p['level'] or elevel < p['level']:
            continue
        if not title and p['link'] == '/':
            title = p['menu']

        if oldlevel < p['level']:
            html+='<ul>\n'
        elif oldlevel > p['level']:
            # close the still-open item of a directory entry first
            if p['link'][-1] == '/':
                html+='</li>\n'
            html+='</ul>\n</li>\n'
        if page != None and page == p:
            html+='<li class="selected"><a href="%s">%s</a>' % (p['link'],p['menu'])
        else:
            html+='<li><a href="%s">%s</a>' % (p['link'],p['menu'])
        # directory links stay open so their children nest inside the <li>
        if p['link'][-1] != '/' or p['link'] == '/':
            html+='</li>\n'
        oldlevel = p['level']
    html+='</ul>\n'
    return (html,title)
401
def writeToTemplate(page,doc,sitemap):
    """Render *doc* into the Cheetah page template and copy its resources."""
    (menu,menuname) = genMenu(page,sitemap,1,MAXLEVEL)
    (levelmenu,levelname) = genMenu(page,sitemap,page['level'],page['level'])
    filled = Template(file=style_tmpl,
                      searchList=[{'title':page['title']},
                                  {'menu':menu},
                                  {'article':doc},
                                  {'levelmenu':levelmenu},
                                  {'levelname':levelname}])
    destination = tmptarget+page['output']
    mkdir_p(os.path.dirname(destination))
    out = open(destination, 'w')
    out.write(str(filled))
    out.close()
    # Ship every on-disk resource the article references alongside the page.
    for resource in page['res']:
        mkdir_p(os.path.dirname(tmptarget+resource))
        shutil.copyfile(resource, tmptarget+resource)
419
def createSitemap(sitemap):
    """Write the sitemap overview page (sitemap.en.html) from the full menu."""
    (menu,menuname) = genMenu(None,sitemap,1,MAXLEVEL)
    filled = Template(file=style_tmpl,
                      searchList=[{'title':'Sitemap'},
                                  {'menu':menu},
                                  {'article':menu},
                                  {'levelmenu':''},
                                  {'levelname':''}])
    destination = tmptarget+'sitemap.en.html'
    mkdir_p(os.path.dirname(destination))
    out = open(destination, 'w')
    out.write(str(filled))
    out.close()
434
# --- object-based pipeline (work in progress) ---
dir_ = Directory()
sitemap = Sitemap()

dir_.scan()
sitemap.read_map()

# Diff the on-disk articles against the recorded sitemap.
missing = dir_.set() - sitemap.set()
removed = sitemap.set() - dir_.set()
for page in removed:
    # was: print removed+'...' -- concatenating the whole set raises
    # TypeError; report the individual page instead
    print(page+' pages missing!!')

for page in missing:
    print('adding missing page '+page)
    # was: sitemap.add_page(page) -- Sitemap defines add_link, not add_page
    sitemap.add_link(page)

sitemap.graph()
451
452
# --- legacy (non-object) pipeline: build every page, then publish ---
sitemap = generateSitemap()
tmptarget = tempfile.mkdtemp()+'/'
for page in sitemap:
    t1 = time.time()
    # trailing comma keeps the cursor on the line so the timing printed
    # below lands after the page name
    print "Page : %-30s %30s" % (page['link'],
                        time.ctime(os.stat(page['file']).st_mtime)),
    doc = expandXincludeTxt(page)
    pubdoc = xsltConvert(doc)
    writeToTemplate(page,pubdoc,sitemap)
    t2 = time.time()
    print "[%5.2f s]" % (round(t2-t1,2))

# Sitemap overview page, then rsync everything (pages, css, images) live.
createSitemap(sitemap)
publish(tmptarget, args.output)
publish(args.style+"css", args.output)
publish(args.style+"images",args.output)