Adding initial support for html forms.
[treecutter.git] / src / tree-cutter.py
1 #!/usr/bin/python
2 import os
3 import fnmatch
4 import subprocess
5 import amara
6 import re
7 import tempfile
8 import errno
9 import time
10 import argparse
11 import shutil
12 import pygraphviz as pgv
13 import glob
14 import gettext
15 import shutil
16 from amara import bindery
17 from amara.xslt import transform
18 from Cheetah.Template import Template
19
# Defaults live next to the article tree: <parent of cwd>/style/default/ and
# <parent of cwd>/htdocs/.
_site_root = os.path.dirname(os.getcwd())
_default_style = _site_root+'/style/default/'
_default_output = _site_root+'/htdocs/'

parser = argparse.ArgumentParser(description='Process docbook article tree.')
# nargs='?' allows the option to be given without a value; const makes that
# case fall back to the default instead of producing None.
parser.add_argument('--style', nargs='?',
                    const=_default_style, default=_default_style)
parser.add_argument('--output', nargs='?',
                    const=_default_output, default=_default_output)
args = parser.parse_args()

# The script builds paths by plain concatenation (args.style+"docbook.xsl"),
# so make sure both directories end with a path separator.
args.style = os.path.join(args.style, '')
args.output = os.path.join(args.output, '')

# Stylesheet handed to xsltproc and the directory the site is published to.
style_xslt = args.style+"docbook.xsl"
outputdir = args.output

# Staging area: the site is assembled here before being rsynced to the
# output directory.
tmptarget = tempfile.mkdtemp()+'/'

# xi:include targets with these extensions are executed rather than included.
valid_scripts = ['.py','.pl']
MAXLEVEL = 10000
34
def mkdir_p(path):
    """Create ``path`` and any missing parents, like ``mkdir -p``.

    An already existing directory is not an error.  Every other failure,
    including ``path`` existing as something that is not a directory, is
    re-raised as the original ``OSError``.
    """
    try:
        os.makedirs(path)
    except OSError as exc: # Python >2.5
        # EEXIST is also reported when a regular file is in the way; only
        # ignore it when the directory really is there.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
42
def publish(src,target):
    """Mirror ``src`` into ``target`` using ``rsync -a --delete``.

    A non-zero rsync exit status is reported on stdout; nothing is raised
    and nothing is returned.
    """
    command = ["rsync","-a","--delete",src,target]
    status = subprocess.call(command)
    if status == 0:
        return
    print('Error: '+' '.join(command)+' Returncode ['+str(status)+']')
48
49
# Namespace prefixes made available to the XPath queries run against the
# article documents (passed as ``prefixes=`` to bindery.parse).
PREFIXES = {
    u'db': u'http://docbook.org/ns/docbook',
    u'xi': u'http://www.w3.org/2001/XInclude',
    u'xl': u'http://www.w3.org/1999/xlink',
    u'html': u'http://www.w3.org/1999/xhtml',
}
54
class Directory():
    """Class containing the state of the directory with articles"""
    def __init__(self):
        # Root of the scan; links are expressed relative to it.
        self._cwd = '.'
        # One link per matching article file (may contain duplicates, one
        # per language version); set() collapses them.
        self._tree = []

    def _link_for(self, file_):
        """Return the site link for the article file ``file_``.

        The link is the file's directory (relative to the scan root)
        followed by the file name up to its first dot.  A file whose name
        part is exactly ``index`` maps to the directory itself, i.e. the
        link ends in ``/``.

        Only the file name is inspected for dots and for ``index``, so
        neither a dot nor the text "index" elsewhere in the path alters the
        link.
        """
        dirname, filename = os.path.split(file_)
        if dirname.startswith(self._cwd):
            dirname = dirname[len(self._cwd):]
        stem = filename.split('.')[0]
        if stem == 'index':
            stem = ''
        return dirname+'/'+stem

    def scan(self):
        """Walk the tree and record a link for every DocBook article.

        Only ``*.xml`` files that have both an article title and a
        titleabbrev (used as menu text) are recorded.
        """
        for dirname, dirnames, filenames in os.walk(self._cwd):
            for filename in fnmatch.filter(filenames, '*.xml'):
                file_ = os.path.join(dirname,filename)
                doc = bindery.parse(file_, prefixes=PREFIXES)
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                if title and menu:
                    self._tree.append(self._link_for(file_))

    def set(self):
        """Return the distinct links found by scan() as a set."""
        return set(self._tree)
76
class Page():
    """Class representing a version of a webpage"""
    def __init__(self,link,page):
        """Create the page for ``link`` from a ``(language, filename)`` pair."""
        self._link = link
        self._file = page[1]
        self._lang = page[0]
        self._doc = None
        self._resources = []
        self._title = None
        self._menu = None
        self._rendered_article = None

    def language(self):
        """Return the language code of this page."""
        return self._lang

    def resources(self):
        """Return the set of local files this page refers to."""
        return set(self._resources)

    def menu(self):
        """Return the menu text (titleabbrev), or None if not prepared."""
        return self._menu

    def set_article(self,art):
        """Set the rendered article body directly, bypassing render()."""
        self._rendered_article = art

    def prepare(self):
        """Parse the article, expand script includes and collect resources."""
        self._doc = bindery.parse(self._file, prefixes=PREFIXES)
        if self._doc.xml_select(u'/db:article/db:info/db:title'):
            self._title = unicode(self._doc.article.info.title)
        if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
            self._menu = unicode(self._doc.article.info.titleabbrev)

        dirname = os.path.dirname(self._file)
        self._run_scripts(dirname)
        self._collect_resources(dirname)

    def _run_scripts(self,dirname):
        """Replace text xi:includes of scripts by the XML the script prints."""
        for c in list(self._doc.xml_select(u"//xi:include[@parse='text']")):
            (p, ext) = os.path.splitext(c.href)
            if ext not in valid_scripts:
                continue
            exe = os.path.abspath(dirname)+'/'+c.href
            proc = subprocess.Popen([exe],stdout=subprocess.PIPE)
            # communicate() reads all output and waits for the child, so
            # no unreaped process is left behind.
            output = proc.communicate()[0]
            xstr = bindery.parse(str(output))
            children = list(xstr.xml_children)
            if not children:
                continue
            # Keep a handle on the parent: the include element is only
            # removed once, after every generated node is in place.
            parent = c.xml_parent
            idp = c.xml_index_on_parent
            for offset, x in enumerate(children):
                # idp+offset keeps the generated nodes in document order,
                # all of them in front of the include element.
                parent.xml_insert(idp+offset,x)
            parent.xml_remove(c)

    def _collect_resources(self,dirname):
        """Record linked files, images and form scripts that exist on disk."""
        for r in self._doc.xml_select(u"//db:link[@xl:href]"):
            rf = os.path.join(dirname,r.href)
            if os.path.isfile(rf):
                self._resources.append(rf)
        for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
            im = os.path.join(dirname,i.fileref)
            if os.path.isfile(im):
                self._resources.append(im)
        for i in self._doc.xml_select(u"//html:form[@action]"):
            # The action may carry a query string or path info after the
            # script name; keep everything up to and including '.py'.
            pyscript = re.split(r'\.py',i.action,1)[0]+'.py'
            im = os.path.join(dirname,pyscript)
            if os.path.isfile(im):
                self._resources.append(im)

    def render(self):
        """Run xsltproc on the prepared document and keep its output.

        The document is written to a temporary file inside the article's
        directory and xsltproc is started with that directory as its working
        directory; the process-wide working directory is left untouched.
        Both temporary files are removed even when rendering fails.  A
        non-zero exit status of xsltproc is reported on stdout and whatever
        output it produced (possibly nothing) is kept.
        """
        #  amara can not handle the docbook stylesheets
        #  xmlarticle = transform(doc,style_xslt)
        workdir = os.path.dirname(self._file) or os.curdir
        infile = None
        outfile = None
        try:
            fd, infile = tempfile.mkstemp(dir=workdir)
            with os.fdopen(fd,'w') as tfi:
                tfi.write(self._doc.xml_encode())
            fd, outfile = tempfile.mkstemp()
            os.close(fd)
            cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
            retcode = subprocess.call(cmd, cwd=workdir)
            if retcode:
                print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
            with open(outfile,'r') as tfo:
                self._rendered_article = tfo.read()
        finally:
            for tmp in (infile, outfile):
                if tmp is not None and os.path.exists(tmp):
                    os.remove(tmp)

    def template(self,sitemap):
        """Fill the language specific page template and write the html file.

        The output goes below the staging directory, at the same relative
        path as the source file with its last 'xml' replaced by 'html'.
        """
        htmlmenu =  sitemap.gen_menu(self._lang,None,"menu")
        levelmenu = sitemap.gen_menu(self._lang,self,"tree")
        langmenu = sitemap.lang_menu(self._lang,self._link)
        template = Template(file=args.style+'index.'+self._lang+'.html.tmpl',
                            searchList=[{'title':self._title},
                                        {'menu':htmlmenu},
                                        {'article':self._rendered_article},
                                        {'levelmenu':levelmenu},
                                        {'langmenu':langmenu}])
        outfile = tmptarget+'html'.join(self._file.rsplit('xml',1))
        mkdir_p(os.path.dirname(outfile))
        with open(outfile, 'w') as out:
            out.write(str(template))
174
175
class Link():
    """Class representing a webpage on the site"""
    def __init__(self,link):
        """Create the link and one Page per language version found on disk.

        A link ending in '/' stands for the ``index`` article of that
        directory.
        """
        self._link = link
        # find the representations of the link.
        self._pages = []
        path = link
        if link.endswith('/'):
            path = path+'index'
        for l in self._scan_languages(path):
            self._pages.append(Page(self,l))

    def add_page(self,l):
        """Add a page from a ``(language, filename)`` pair."""
        self._pages.append(Page(self,l))

    def _scan_languages(self,path):
        """Return sorted ``(language, filename)`` pairs for ``path``.

        Only files named exactly ``.<path>.<language>.xml`` (relative to
        the current directory) are accepted, so neither pages that merely
        share the prefix (``foobar.en.xml`` for ``/foo``) nor files with
        extra suffixes (``foo.en.xml.bak``) are picked up.
        """
        prefix = '.'+path+'.'
        suffix = '.xml'
        lang = []
        for l in sorted(glob.glob(prefix+'*'+suffix)):
            code = l[len(prefix):-len(suffix)]
            # The language part must be a single, non-empty name component.
            if code and '.' not in code:
                lang.append((code,l))
        return lang

    def link(self):
        """Return the link text."""
        return self._link

    def prepare(self):
        """Prepare every language version of the page."""
        for page in self._pages:
            page.prepare()

    def languages(self):
        """Return the language codes of the pages, in page order."""
        return [page.language() for page in self._pages]

    def render(self):
        """Render every language version of the page."""
        for page in self._pages:
            page.render()

    def template(self,sitemap):
        """Write the templated html of every language version."""
        for page in self._pages:
            page.template(sitemap)

    def page(self,lang):
        """Return the page in language ``lang``, or None if there is none."""
        for page in self._pages:
            if page.language()==lang:
                return page
        return None

    def resources(self):
        """Return the union of the resources of all language versions."""
        res  = set()
        for page in self._pages:
            res = res.union(page.resources())
        return res
232
233
class Node():
    """One entry of a Trie: a key token, its payload and its child nodes."""
    def __init__(self,token,value):
        self._children = []
        self._token, self._value = token, value

    def token(self):
        """Return the key token this node is stored under."""
        return self._token

    def value(self):
        """Return the payload attached to this node."""
        return self._value

    def children(self):
        """Return the (mutable) list of child nodes."""
        return self._children
248
class Trie():
    """Tree of Node objects keyed by a list of tokens."""
    def __init__(self):
        self._root = []

    def __iter__(self):
        """Iterate over the stored values, parents before their children."""
        return self.inorder(self._root)

    def inorder(self,t):
        """Yield the value of every node in ``t``, each followed by its subtree."""
        for l in t:
            yield l.value()
            for x in self.inorder(l.children()):
                yield x

    def _add(self,trie, key, content):
        """Attach ``content`` below ``trie`` under the token list ``key``.

        Returns True if a node was created.  Returns False if ``key`` is
        empty or one of the intermediate tokens has no node yet; nothing is
        stored in that case.  ``key`` itself is not modified.
        """
        if not key:
            return False
        k = key[0]
        rest = key[1:]
        # is the key a leaf
        if not rest:
            trie.append(Node(k,content))
            return True
        added = False
        for ch in trie:
            if ch.token() == k:
                added = self._add(ch.children(), rest, content) or added
        return added

    def add(self,key, content):
        """Store ``content`` under ``key``; see _add for the return value."""
        return self._add(self._root, key, content)

    def _graph(self, trie, G):
        """Add the nodes of ``trie`` and their parent/child edges to ``G``."""
        for l in trie:
            G.add_node(l.token())
            for ch in l.children():
                G.add_edge(l.token(),ch.token())
            # Descend once per node, not once per child edge.
            self._graph(l.children(), G)

    def graph(self):
        """Build and return a directed pygraphviz graph of the trie.

        All top level nodes hang below an artificial "sitemap" node.
        """
        G = pgv.AGraph(directed=True)
        G.add_node("sitemap")
        for ch in self._root:
            G.add_edge("sitemap",ch.token())
        self._graph(self._root, G)
        return G

    def _menu(self, trie, lang, page, css):
        """Return the nested <ul> markup for ``trie`` in language ``lang``.

        The entry whose page in ``lang`` is ``page`` gets class "selected".
        Entries without a page in ``lang`` link to the English version and
        are marked with a trailing '*'.
        """
        parts = ["<ul%s>\n" % css]
        for l in trie:
            link = l.value()
            p = link.page(lang)
            sel = ''
            # A missing translation (p is None) must never count as the
            # current page, not even when no page is selected at all.
            if p is not None and p == page:
                sel = ' class="selected"'
            if p is not None:
                parts.append('<li%s><a href="%s">%s</a>\n'
                             % (sel,link.link(),p.menu()))
            else:
                # NOTE(review): this assumes every link has an 'en' page;
                # link.page('en') returning None fails here - confirm.
                parts.append('<li%s><a href="%s.en" hreflang="en">%s</a>*\n'
                             % (sel,link.link(), link.page('en').menu()))
            if l.children():
                parts.append(self._menu(l.children(), lang, page, ""))
        parts.append("</ul>\n")
        return ''.join(parts)

    def menu(self,lang,page,cssclass):
        """Return the menu markup; ``cssclass`` is set on the outermost <ul>."""
        css = ''
        if cssclass:
            css = ' class="'+cssclass+'"'
        return self._menu(self._root, lang, page, css)
316
class Sitemap():
    """Class keeping the internal site structure"""
    def __init__(self):
        self._file = 'sitemap.txt'
        self._tree = Trie()
        # Language codes that occur on at least one page.
        self._sitelang = set()
        # ISO 639-3 table used to look up language names.
        self._isocode = bindery.parse('/usr/share/xml/iso-codes/iso_639_3.xml')
        # gettext catalogs for translating language names, keyed by language.
        self._tranlang = {}

    def add_link(self, link):
        """Split ``link`` into path tokens and store a Link for it in the tree."""
        tokens = [t for t in re.split(r'(^/[\w-]*/|[\w-]*/)',link) if t]
        self._tree.add(tokens,Link(link))

    def write_map(self):
        """Write all links, one per line, to the sitemap file."""
        with open(self._file,'w') as f:
            f.write('\n'.join(link.link() for link in self._tree))

    def read_map(self):
        """Load the links listed in the sitemap file, if it can be read.

        An unreadable or missing file is reported on stdout and otherwise
        ignored.
        """
        try:
            with open(self._file) as f:
                sml = f.read().split()
        except IOError:
            print('INFO: Could not read sitemap.txt - one will be created')
            return
        for line in sml:
            self.add_link(line)

    def set(self):
        """Return the set of link texts in the sitemap."""
        return set(link.link() for link in self._tree)

    def process(self):
        """Build the whole site into the staging directory.

        Runs, and prints the duration of, each stage: prepare, language
        setup, render, template, resource copy and sitemap page generation.
        """
        t1 = time.time()
        for link in self._tree:
            link.prepare()
        t2 = time.time()
        print("Prepare  [%5.2f s]" % (round(t2-t1,2)))
        for link in self._tree:
            self._sitelang = self._sitelang.union(set(link.languages()))
        for tran in self._sitelang:
            if tran != 'en':
                # fallback=True: without a catalog for this language the
                # names simply stay untranslated instead of aborting.
                self._tranlang[tran] = gettext.translation('iso_639_3',
                                                           languages=[tran],
                                                           fallback=True)
        t3 = time.time()
        print("Language [%5.2f s]" % (round(t3-t2,2)))
        for link in self._tree:
            link.render()
        t4 = time.time()
        print("Render   [%5.2f s]" % (round(t4-t3,2)))
        for link in self._tree:
            link.template(self)
        t5 = time.time()
        print("Template [%5.2f s]" % (round(t5-t4,2)))
        res = set()
        for link in self._tree:
            res = res.union(link.resources())
        for f in res:
            outfile = tmptarget+f
            mkdir_p(os.path.dirname(outfile))
            shutil.copyfile(f,outfile)
        # Taken after the copy so the figure covers the copying itself.
        t6 = time.time()
        print("Resources[%5.2f s]" % (round(t6-t5,2)))
        sitemaplink = Link('/sitemap')
        # Register every language first: the language menu of each sitemap
        # page lists all of them.
        for l in self._sitelang:
            sitemaplink.add_page((l,'/sitemap.'+l+'.xml'))
        for l in self._sitelang:
            sitemaplink.page(l).set_article(self.gen_menu(l,None,"tree sitemap"))
            sitemaplink.page(l).template(self)
        t7 = time.time()
        print("Sitemap  [%5.2f s]" % (round(t7-t6,2)))

    def graph(self):
        """Build the graph representation of the site tree."""
        self._tree.graph()

    def gen_menu(self,lang,page,cssclass):
        """Return the site menu markup in ``lang`` with ``page`` selected."""
        return self._tree.menu(lang,page,cssclass)

    def lang_menu(self,lang,link):
        """Return a <ul> linking to every language version of ``link``.

        Language names come from the ISO 639-3 table and are translated
        into ``lang`` unless ``lang`` is 'en'.
        """
        items = []
        for l in link.languages():
            isoxml = u"//iso_639_3_entry[@*='"+l+"']"
            ln = self._isocode.xml_select(isoxml)[0].name
            if lang != 'en':
                ln = self._tranlang[lang].gettext(ln)
            p = link.link()
            if p[-1] == '/':
                p = p +'index'
            p = p+'.'+l
            items.append('<li><a href="%s" hreflang="%s">%s</a></li>' % (p, l, ln))
        return "<ul>"+''.join(items)+"</ul>"

    def publish(self):
        """Sync the staged site and the style's css and images to the output."""
        publish(tmptarget, args.output)
        publish(args.style+"css", args.output)
        publish(args.style+"images",args.output)
414
# Script body: reconcile the articles on disk with sitemap.txt, build the
# site into the staging directory and publish it.
ts = time.time()
dir_ = Directory()
sitemap = Sitemap()

dir_.scan()
sitemap.read_map()

found = dir_.set()
known = sitemap.set()
missing = found - known
removed = known - found
for page in sorted(removed):
    print(page+' pages missing!!')
# Sorted so that a parent link ('/a/') is added before the links below it
# ('/a/b'): the trie only attaches a node under a parent that already exists.
for page in sorted(missing):
    print('adding missing page '+page)
    sitemap.add_link(page)
if missing or removed:
    print('writing new sitemap - please adjust if needed')
    sitemap.write_map()
sitemap.graph()

sitemap.process()

t1 = time.time()
sitemap.publish()
t2 = time.time()
print("Publish  [%5.2f s]" % (round(t2-t1,2)))
print("Total    [%5.2f s]" % (round(t2-ts,2)))

# The staging directory was created with mkdtemp and has been synced to the
# output directory; remove it so repeated runs do not pile up temp trees.
shutil.rmtree(tmptarget, ignore_errors=True)