Move the creation of the tmp directory to before the new class structure.
[treecutter.git] / src / tree-cutter.py
1 #!/usr/bin/python
2 import os
3 import fnmatch
4 import subprocess
5 import amara
6 import re
7 import tempfile
8 import errno
9 import time
10 import argparse
11 import shutil
12 import pygraphviz as pgv
13 import glob
14 from amara import bindery
15 from amara.xslt import transform
16 from Cheetah.Template import Template
17
# Command-line interface: both options default to sibling directories of
# the current working directory ("style/default" and "htdocs").
parser = argparse.ArgumentParser(description='Process docbook article tree.')
parser.add_argument('--style', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/style/default/')
parser.add_argument('--output', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/htdocs/')
args = parser.parse_args()

# The XSLT stylesheet and the Cheetah page template live in the style dir.
# NOTE(review): plain string concatenation assumes --style ends with '/' --
# confirm callers always pass a trailing slash.
style_xslt = args.style+"docbook.xsl"
style_tmpl = args.style+"index.en.html.tmpl"
outputdir = args.output

# Scratch directory in which the site is assembled before being rsync-ed
# to the output directory.
tmptarget = tempfile.mkdtemp()+'/'

# Extensions of <xi:include parse='text'> targets that are executed and
# whose stdout is spliced into the document.
valid_scripts = ['.py','.pl']
# Sentinel meaning "no depth limit" for menu generation.
MAXLEVEL = 10000
33
def mkdir_p(path):
    """Create *path* like ``mkdir -p``: make intermediate directories
    and do not complain when the directory already exists.

    Raises OSError for any other failure, including the case where
    *path* exists but is not a directory.
    """
    try:
        os.makedirs(path)
    except OSError as exc: # Python >2.5
        # Only swallow "already exists" when it really is a directory;
        # an existing regular file of the same name is a genuine error.
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else: raise
41
42 def publish(src,target):
43     cmd = ["rsync","-a","--delete",src,target]
44     retcode = subprocess.call(cmd)
45     if retcode:
46         print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
47
48
# XML namespace prefixes used by the amara XPath queries in this file:
# db = DocBook 5, xi = XInclude, xl = XLink.
PREFIXES={u'db': u'http://docbook.org/ns/docbook',
          u'xi': u'http://www.w3.org/2001/XInclude',
          u'xl': u'http://www.w3.org/1999/xlink'}
52
class Directory():
    """Walks the working directory and records the site link of every
    DocBook article that has both a title and a titleabbrev."""

    def __init__(self):
        self._cwd = '.'
        self._tree = []

    def scan(self):
        """Collect links for every qualifying *.xml article under cwd."""
        for dirpath, subdirs, files in os.walk(self._cwd):
            for name in files:
                if not fnmatch.fnmatch(name, '*.xml'):
                    continue
                path = os.path.join(dirpath, name)
                doc = bindery.parse(path, prefixes=PREFIXES)
                has_title = doc.xml_select(u'/db:article/db:info/db:title')
                has_menu = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                if not (has_title and has_menu):
                    continue
                # './foo/index.en.xml' -> '/foo/index' -> link '/foo/'
                stem = path.split('.')[1]
                self._tree.append(stem.replace('index',''))

    def set(self):
        """Return the collected links as a set (duplicates removed)."""
        return set(self._tree)
74
75 class Page():
76     """Class representing a version of a webpage"""
77     def __init__(self,page):
78         self._file = page[1]
79         self._lang = page[0]
80         self._doc = None
81         self._resources = []
82         self._title = None
83         self._menu = None
84         self._rendered_article = None
85
86     def prepare(self):
87         self._doc = bindery.parse(self._file, prefixes=PREFIXES)
88         if self._doc.xml_select(u'/db:article/db:info/db:title'):
89             self._title = unicode(self._doc.article.info.title)
90         if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
91             self._menu = unicode(self._doc.article.info.titleabbrev)
92
93         dirname = os.path.dirname(self._file)
94         code  = self._doc.xml_select(u"//xi:include[@parse='text']")
95         if code:
96             for c in code:
97                 (p, ext) = os.path.splitext(c.href)
98                 if ext in valid_scripts:
99                     exe = os.path.join(os.path.abspath(dirname+c.href))
100                     xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
101                     xstr = bindery.parse(str(xml.stdout.read()))
102                     idp = c.xml_index_on_parent
103                     for x in xstr.xml_children:
104                         c.xml_parent.xml_insert(idp,x)
105                         c.xml_parent.xml_remove(c)
106
107         for r in self._doc.xml_select(u"//db:link[@xl:href]"):
108             rf = os.path.join(dirname,r.href)
109             if os.path.isfile(rf):
110                 self._resources.append(rf)
111         for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
112             im = os.path.join(dirname,i.fileref)
113             if os.path.isfile(im):
114                 self._resources.append(im)
115
116     def render(self):
117         #  amara can not handle the docbook stylesheets
118         #  xmlarticle = transform(doc,style_xslt)
119         cwd = os.getcwd()
120         dirname = os.path.dirname(self._file)
121         os.chdir(dirname)
122         infile  = os.path.basename(tempfile.mktemp())
123         outfile = tempfile.mktemp()
124         tfi = open(infile,'w')
125         tfi.write(self._doc.xml_encode())
126         tfi.close()
127 #  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
128         cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
129         retcode = subprocess.call(cmd)
130         if retcode:
131             print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
132         tfo = open(outfile,'r')
133         self._rendered_article = tfo.read()
134         tfo.close()
135         os.remove(infile)
136         os.remove(outfile)
137         os.chdir(cwd)
138
139     def template(self,sitemap):
140         htmlmenu =  sitemap.gen_menu(self._lang,None)
141         levelmenu = sitemap.gen_menu(self._lang,self)
142         template = Template(file=style_tmpl,
143                             searchList=[{'title':self._title},
144                                         {'menu':htmlmenu},
145                                         {'article':self._rendered_article},
146                                         {'levelmenu':levelmenu},
147                                         {'levelname':levelname}])
148         outfile = tmptarget+self._file+'.'+self._lang+'.html'
149         mkdir_p(os.path.dirname(outfile))
150         out = open(outfile, 'w')
151         out.write(str(template))
152         out.close()
153
154
class Link():
    """Class representing a webpage on the site"""
    def __init__(self,link):
        self._link = link
        # A link ending in '/' denotes a directory; its page file is the
        # directory's 'index'.
        self._pages = []
        path = link+'index' if self._link[-1] == '/' else link
        # Instantiate a Page for every language variant found on disk.
        for language in self._scan_languages(path):
            self._pages.append(Page(language))

    def _scan_languages(self,path):
        """Return (lang, filename) tuples for every '.<lang>.xml' variant."""
        found = []
        for candidate in glob.glob('.'+path+'*'):
            parts = candidate.split('.')
            if len(parts) > 3 and parts[3] == 'xml':
                found.append((parts[2],candidate))
        return found

    def link(self):
        """Accessor for the site-absolute link string."""
        return self._link
178
class Node():
    """A single trie node: a path token, its payload and child nodes."""
    def __init__(self,token,value):
        self._token = token
        self._value = value
        self._children = []

    def token(self):
        """The path component this node matches."""
        return self._token

    def value(self):
        """The payload stored at this node."""
        return self._value

    def children(self):
        """Mutable list of child Nodes."""
        return self._children

class Trie():
    """Trie of site content keyed by lists of path tokens.

    Iterating the trie yields the stored values depth-first: a node's
    value, then the values of its subtree, then its next sibling.
    """
    def __init__(self):
        self._root = []

    def __iter__(self):
        return self.inorder(self._root)

    def inorder(self,t):
        """Yield every value in the node list *t*, recursing into children.

        The previous implementation discarded the recursive generator
        (and passed a Node where a node list was expected), so values
        below the first level were never yielded.
        """
        for node in t:
            yield node.value()
            for v in self.inorder(node.children()):
                yield v

    def _add(self,trie, key, content):
        # *key* is a list of path tokens; consume the first one.
        # NOTE(review): when several tokens remain but no child matches,
        # the content is silently dropped -- the input is assumed to list
        # parents before children.  Confirm callers guarantee this.
        k = key.pop(0)
        if key == []:
            node = Node(k,content)
            trie.append(node)
        else:
            for ch in trie:
                if ch.token() == k:
                    self._add(ch.children(), key, content)

    def add(self,key, content):
        """Insert *content* at the path given by the token list *key*."""
        self._add(self._root, key, content)

    def _graph(self, trie, G):
        # Add each node and the edges to its children, then recurse once
        # per subtree.  (The old code recursed inside the child loop,
        # re-walking the same subtree once per sibling.)
        for l in trie:
            G.add_node(l.token())
            for ch in l.children():
                G.add_edge(l.token(),ch.token())
            self._graph(l.children(), G)

    def graph(self):
        """Build a pygraphviz graph of the site structure rooted at a
        synthetic 'sitemap' node (drawing code currently disabled)."""
        G = pgv.AGraph(directed=True)
        G.add_node("sitemap")
        for ch in self._root:
            G.add_edge("sitemap",ch.token())
        self._graph(self._root, G)
#        G.layout('dot')
#        G.draw('g.png')
#        print G.string()
237
class Sitemap():
    """Class keeping the internal site structure"""
    def __init__(self):
        # Persistent link list, one link per line.
        self._file = 'sitemap.txt'
        self._tree = Trie()

    def add_link(self, link):
        # Split '/foo/bar/' into its path tokens ('/foo/', 'bar/') and
        # store a Link object under that token path in the trie.
        tokens = filter(None,re.split(r'(^/\w*/|\w*/)',link))
        self._tree.add(tokens,Link(link))

    def read_map(self):
        # Load sitemap.txt (whitespace-separated links).  A missing file
        # is not an error: the map will be regenerated and saved later.
        try:
            f = open(self._file)
            sml = f.read().split()
            f.close()
            for line in sml:
                self.add_link(line)
        except IOError, what_error:
            print 'INFO: Could not read sitemap.txt - one will be created'

    def set(self):
        # All known links as a set, via the trie's value iterator.
        return set(link.link() for link in self._tree)

    def graph(self):
        # Visualise the site structure (see Trie.graph).
        self._tree.graph()

    def gen_menu(self,lang,page):
        # Placeholder: HTML menu generation is not implemented yet.
        return 'Generate menu from sitemap - To be implemented'
266
def generateSitemap():
    """Scan the tree for DocBook articles and (re)write sitemap.txt.

    Returns a list of page dicts (keys: link, title, menu, output, exe,
    file, res, level), seeded with the order of an existing sitemap.txt
    when one is present so the published ordering stays stable.
    """
    sitemap = []
    # Seed with the existing sitemap to preserve ordering.
    try:
        sfile = open('sitemap.txt')
        flist = sfile.read().split()
        sfile.close()
        for f in flist:
            sitemap.append(dict(link=f))
    except IOError, what_error:
        print 'Sitemap missing - generating one.'

    for dirname, dirnames, filenames in os.walk('.'):
        for filename in filenames:
            if fnmatch.fnmatch(filename, '*.xml'):
                xfile = os.path.join(dirname,filename)
                doc = bindery.parse(xfile,
                                    prefixes={u'db': u'http://docbook.org/ns/docbook',
                                              u'xi': u'http://www.w3.org/2001/XInclude',
                                              u'xl': u'http://www.w3.org/1999/xlink'})
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                code  = doc.xml_select(u"//xi:include[@parse='text']")
                resource = doc.xml_select(u"//db:link[@xl:href]")
                image = doc.xml_select(u"//db:imagedata[@fileref]")
                # Does the page embed an executable text include?
                exe = 0
                for c in code:
                    (p, ext) = os.path.splitext(c.href)
                    if ext in valid_scripts:
                        exe = 1

                # Only articles with both title and titleabbrev are pages.
                if title and menu:
                    found = 0
                    # './foo/index.en.xml' -> '/foo/index' -> link '/foo/'
                    base = xfile.split('.')[1]
                    link = base.replace('index','')
                    # Menu depth = number of path components in the link.
                    level = len(filter(None,re.split(r'(^/\w*/|\w*/)',link)))
                    # Linked files and images that exist on disk will be
                    # copied alongside the rendered page.
                    res = []
                    for r in resource:
                        rf = os.path.join(dirname,r.href)
                        if os.path.isfile(rf):
                            res.append(rf)
                    for i in image:
                        im = os.path.join(dirname,i.fileref)
                        if os.path.isfile(im):
                            res.append(im)
                    page = dict(title=unicode(doc.article.info.title),
                                menu=unicode(doc.article.info.titleabbrev),
                                output=os.path.join(dirname,
                                                    filename.replace('xml','html')),
                                exe=exe,
                                file=xfile,
                                res=res,
                                level=level)
                    # Merge into an existing entry, or append a new one.
                    for l in sitemap:
                        if l['link'] == link:
                            found = 1
                            l.update(page)
                    if not found:
                        print "adding "+link+" to sitemap"
                        dd = dict(link=link)
                        dd.update(page)
                        sitemap.append(dd)
    # Persist the (possibly extended) link list.
    sfile = open('sitemap.txt','w')
    for l in sitemap:
        sfile.write(l['link']+'\n')
    sfile.close()
    return sitemap
333
def expandXincludeTxt(page):
    """Replace <xi:include parse='text'> elements that point at an
    executable script with the XML that script prints on stdout.

    *page* is a page dict from generateSitemap(); returns the parsed
    (and possibly modified) amara document.
    """
    # Use the module-wide namespace map rather than a private copy.
    doc = bindery.parse(page['file'], prefixes=PREFIXES)
    if page['exe']:
        code  = doc.xml_select(u"//xi:include[@parse='text']")
        for c in code:
            (p, ext) = os.path.splitext(c.href)
            if ext in valid_scripts:
                exe = os.path.join(os.path.abspath(c.href))
                xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
                xstr = bindery.parse(str(xml.stdout.read()))
                # 'idx' -- do not shadow the builtin 'id'.
                idx = c.xml_index_on_parent
                for x in xstr.xml_children:
                    c.xml_parent.xml_insert(idx,x)
                c.xml_parent.xml_remove(c)
    return doc
351
352 def xsltConvert(doc):
353 #  amara can not handle the docbook stylesheets
354 #  xmlarticle = transform(doc,style_xslt)
355     cwd = os.getcwd()
356     rundir = os.path.dirname(page['file'])
357     os.chdir(rundir)
358     infile  = os.path.basename(tempfile.mktemp())
359     outfile = tempfile.mktemp()
360     tfi = open(infile,'w')
361     tfi.write(doc.xml_encode())
362     tfi.close()
363 #  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
364     cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
365     retcode = subprocess.call(cmd)
366     if retcode:
367         print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
368     tfo = open(outfile,'r')
369     result = tfo.read()
370     tfo.close()
371     os.remove(infile)
372     os.remove(outfile)
373     os.chdir(cwd)
374     return result
375
def genMenu(page,sitemap,slevel,elevel):
    """Build a nested <ul> HTML menu from the ordered *sitemap* list.

    Only entries with slevel <= level <= elevel are rendered.  With
    elevel == MAXLEVEL (or 1, or page is None) the whole map is used;
    otherwise only the contiguous run of entries at *page*'s level, and
    the nearest preceding shallower entry supplies the menu title.
    Returns (html, title).
    """
    title = None
    sm = []
    if elevel == MAXLEVEL or elevel == 1 or page == None:
        html = '<ul>\n'
        sm = sitemap
    else:
        html = '<ul class="tree">\n'
        # Walk back to the entry just above this page's level for the
        # title...  NOTE(review): assumes such an entry exists before
        # index 0 is passed -- confirm the sitemap always starts with '/'.
        idx = sitemap.index(page)
        while (sitemap[idx]['level'] == page['level']):
            idx = idx-1
        title = sitemap[idx]['menu']
        # ...then collect the contiguous entries sharing the page level.
        idx = idx+1
        while (idx < len(sitemap) and sitemap[idx]['level'] == page['level']):
            sm.append(sitemap[idx])
            idx = idx+1
    oldlevel = slevel

    for p in sm:
        # Skip entries outside the requested depth window.
        if slevel > p['level'] or elevel < p['level']:
            continue
        if not title and p['link'] == '/':
            title = p['menu']

        # Open/close nested lists whenever the depth changes.
        if oldlevel < p['level']:
            html+='<ul>\n'
        elif oldlevel > p['level']:
            if p['link'][-1] == '/':
                html+='</li>\n'
            html+='</ul>\n</li>\n'
        # Highlight the entry for the page being rendered.
        if page != None and page == p:
            html+='<li class="selected"><a href="%s">%s</a>' % (p['link'],p['menu'])
        else:
            html+='<li><a href="%s">%s</a>' % (p['link'],p['menu'])
        # Directory entries stay open to receive a nested list; leaves
        # (and the root) close immediately.
        if p['link'][-1] != '/' or p['link'] == '/':
            html+='</li>\n'
        oldlevel = p['level']
    html+='</ul>\n'
    return (html,title)
415
def writeToTemplate(page,doc,sitemap):
    """Render *doc* through the Cheetah template into tmptarget and copy
    the page's resource files next to it."""
    (menu,menuname) = genMenu(page,sitemap,1,MAXLEVEL)
    (levelmenu,levelname) = genMenu(page,sitemap,page['level'],page['level'])
    context = [{'title':page['title']},
               {'menu':menu},
               {'article':doc},
               {'levelmenu':levelmenu},
               {'levelname':levelname}]
    template = Template(file=style_tmpl, searchList=context)
    outfile = tmptarget+page['output']
    mkdir_p(os.path.dirname(outfile))
    out = open(outfile, 'w')
    out.write(str(template))
    out.close()
    # Linked files and images accompany the rendered page.
    for resource in page['res']:
        mkdir_p(os.path.dirname(tmptarget+resource))
        shutil.copyfile(resource, tmptarget+resource)
433
def createSitemap(sitemap):
    """Render the sitemap page: the full menu doubles as the article."""
    (menu,menuname) = genMenu(None,sitemap,1,MAXLEVEL)
    context = [{'title':'Sitemap'},
               {'menu':menu},
               {'article':menu},
               {'levelmenu':''},
               {'levelname':''}]
    template = Template(file=style_tmpl, searchList=context)
    outfile = tmptarget+'sitemap.en.html'
    mkdir_p(os.path.dirname(outfile))
    out = open(outfile, 'w')
    out.write(str(template))
    out.close()
448
449 dir_ = Directory()
450 sitemap = Sitemap()
451
452 dir_.scan()
453 sitemap.read_map()
454
455 missing = dir_.set() - sitemap.set()
456 removed = sitemap.set() - dir_.set()
457 for page in removed:
458     print removed+' pages missing!!'
459
460 for page in missing:
461     print 'adding missing page '+page
462     sitemap.add_page(page)
463
464 sitemap.graph()
465
466
467 sitemap = generateSitemap()
468 tmptarget = tempfile.mkdtemp()+'/'
469 for page in sitemap:
470     t1 = time.time()
471     print "Page : %-30s %30s" % (page['link'],
472                         time.ctime(os.stat(page['file']).st_mtime)),
473     doc = expandXincludeTxt(page)
474     pubdoc = xsltConvert(doc)
475     writeToTemplate(page,pubdoc,sitemap)
476     t2 = time.time()
477     print "[%5.2f s]" % (round(t2-t1,2))
478
479 createSitemap(sitemap)
480 publish(tmptarget, args.output)
481 publish(args.style+"css", args.output)
482 publish(args.style+"images",args.output)