Adding mapping Sitemap -> Link -> Page to deal with Content-Language
[treecutter.git] / src / tree-cutter.py
1 #!/usr/bin/python
2 import os
3 import fnmatch
4 import subprocess
5 import amara
6 import re
7 import tempfile
8 import errno
9 import time
10 import argparse
11 import shutil
12 import pygraphviz as pgv
13 import glob
14 from amara import bindery
15 from amara.xslt import transform
16 from Cheetah.Template import Template
17
# Command-line interface: --style points at the template/stylesheet
# directory, --output at the publish target.  Both default to sibling
# directories of the current working directory.
parser = argparse.ArgumentParser(description='Process docbook article tree.')
parser.add_argument('--style', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/style/default/')
parser.add_argument('--output', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/htdocs/')
args = parser.parse_args()

# Derived paths: the docbook XSL stylesheet and the Cheetah page template.
style_xslt = args.style+"docbook.xsl"
style_tmpl = args.style+"index.en.html.tmpl"
outputdir = args.output

# Script extensions that <xi:include parse='text'> is allowed to execute.
valid_scripts = ['.py','.pl']
# Sentinel "no depth limit" value for menu generation.
MAXLEVEL = 10000
31
def mkdir_p(path):
    """Create *path* and any missing parents, like ``mkdir -p``.

    Succeeds silently when the directory already exists; any other
    OSError (permissions, broken parent, ...) is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as exc: # Python >2.5
        # EEXIST is only benign when the existing entry really is a
        # directory; a plain file with the same name must still raise.
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else: raise
39
def publish(src,target):
    """Mirror *src* into *target* via rsync, deleting stale files."""
    command = ["rsync","-a","--delete",src,target]
    status = subprocess.call(command)
    if status:
        # rsync failed; report the exact command line and its exit code.
        print('Error: '+' '.join(command)+' Returncode ['+str(status)+']')
45
46
# XML namespace prefixes shared by all docbook XPath queries below.
PREFIXES={u'db': u'http://docbook.org/ns/docbook',
          u'xi': u'http://www.w3.org/2001/XInclude',
          u'xl': u'http://www.w3.org/1999/xlink'}
50
class Directory():
    """Class containing the state of the directory with articles"""
    def __init__(self):
        self._cwd = '.'
        self._tree = []

    def scan(self):
        """Remember the link of every *.xml article that carries both a
        title and a titleabbrev (menu name)."""
        for dirname, dirnames, filenames in os.walk(self._cwd):
            for filename in filenames:
                if not fnmatch.fnmatch(filename, '*.xml'):
                    continue
                file_ = os.path.join(dirname, filename)
                doc = bindery.parse(file_, prefixes=PREFIXES)
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                if title and menu:
                    # './foo/index.en.xml' -> '/foo/index' -> '/foo/'
                    link = file_.split('.')[1].replace('index', '')
                    self._tree.append(link)

    def set(self):
        """All discovered links as a set."""
        return set(self._tree)
72
class Page():
    """Class representing a version of a webpage"""
    def __init__(self,page):
        # *page* is a (language, filename) pair.
        self._lang = page[0]
        self._file = page[1]
        # Filled in later during processing.
        self._doc = None
        self._resources = []
        self._title = None
        self._menu = None
        self._rendered_article = None
83
class Link():
    """Class representing a webpage on the site"""
    def __init__(self,link):
        self._link = link
        self._pages = []
        # Directory links ('/foo/') are backed by an index document.
        path = link + 'index' if self._link[-1] == '/' else link
        for variant in self._scan_languages(path):
            self._pages.append(Page(variant))

    def _scan_languages(self,path):
        """Return [(lang, filename), ...] for every '<path>.<lang>.xml'
        variant found relative to the current directory."""
        found = []
        for candidate in glob.glob('.' + path + '*'):
            parts = candidate.split('.')
            if len(parts) > 3 and parts[3] == 'xml':
                found.append((parts[2], candidate))
        return found

    def link(self):
        """The site-relative link string."""
        return self._link
107
class Node():
    """A single trie node: a path token, a payload, and its children."""

    def __init__(self,token,value):
        self._token = token
        self._value = value
        self._children = []

    def token(self):
        """The path component this node represents."""
        return self._token

    def value(self):
        """The payload stored at this node."""
        return self._value

    def children(self):
        """Mutable list of child nodes."""
        return self._children
122
class Trie():
    """Prefix tree over URL path tokens; payloads live on the nodes."""
    def __init__(self):
        self._root = []

    def _add(self,trie, key, content):
        """Insert *content* at the node addressed by the token list *key*
        (consumed destructively)."""
        # is the key a leaf
        k = key.pop(0)
        if key == []:
            node = Node(k,content)
            trie.append(node)
        else:
            for ch in trie:
                if ch.token() == k:
                    self._add(ch.children(), key, content)
                    return
            # No matching child: create the intermediate node instead of
            # silently dropping the entry (the old code lost links whose
            # parent had not been added yet).
            node = Node(k, None)
            trie.append(node)
            self._add(node.children(), key, content)

    def add(self,key, content):
        self._add(self._root, key, content)

    def _graph(self, trie, G):
        """Add every node and parent->child edge under *trie* to *G*."""
        for l in trie:
            G.add_node(l.token())
            for ch in l.children():
                G.add_edge(l.token(),ch.token())
            # Recurse once per node -- the old code recursed inside the
            # child loop, re-walking each subtree once per sibling edge.
            self._graph(l.children(), G)

    def graph(self):
        """Print a graphviz rendering of the trie rooted at "sitemap"."""
        G = pgv.AGraph(directed=True)
        G.add_node("sitemap")
        for ch in self._root:
            G.add_edge("sitemap",ch.token())
        self._graph(self._root, G)
#        G.layout('dot')
#        G.draw('g.png')
        print(G.string())
157
class Sitemap():
    """Class keeping the internal site structure"""
    def __init__(self):
        self._file = 'sitemap.txt'
        self._tree = Trie()

    def add_link(self, link):
        """Insert *link* into the trie, keyed by its path components."""
        tokens = filter(None,re.split(r'(^/\w*/|\w*/)',link))
        self._tree.add(tokens,Link(link))

    # The main script calls add_page(); keep it as an alias of add_link
    # so that call no longer raises AttributeError.
    add_page = add_link

    def read_map(self):
        """Load sitemap.txt if present; a missing file is not an error."""
        try:
            f = open(self._file)
            sml = f.read().split()
            f.close()
            for line in sml:
                self.add_link(line)
        except IOError:
            print('INFO: Could not read sitemap.txt - one will be created')

    def set(self):
        """Return the set of all link strings stored in the trie.

        The previous version iterated the Trie object directly, which is
        not iterable and raised TypeError; walk the node tree instead.
        """
        links = set()
        stack = list(self._tree._root)
        while stack:
            node = stack.pop()
            if node.value() is not None:
                links.add(node.value().link())
            stack.extend(node.children())
        return links

    def graph(self):
        self._tree.graph()
183
def generateSitemap():
    """Build the sitemap: seed it from sitemap.txt (if any), then scan
    every *.xml docbook article under the current directory, merging
    each article's metadata (title, menu name, output path, resources,
    level) into the ordered list of link dicts.  Rewrites sitemap.txt
    and returns the list."""
    sitemap = []
    try:
        # Seed with the existing file so pages keep their ordering.
        sfile = open('sitemap.txt')
        flist = sfile.read().split()
        sfile.close()
        for f in flist:
            sitemap.append(dict(link=f))
    except IOError, what_error:
        print 'Sitemap missing - generating one.'

    for dirname, dirnames, filenames in os.walk('.'):
        for filename in filenames:
            if fnmatch.fnmatch(filename, '*.xml'):
                xfile = os.path.join(dirname,filename)
                doc = bindery.parse(xfile,
                                    prefixes={u'db': u'http://docbook.org/ns/docbook',
                                              u'xi': u'http://www.w3.org/2001/XInclude',
                                              u'xl': u'http://www.w3.org/1999/xlink'})
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                code  = doc.xml_select(u"//xi:include[@parse='text']")
                resource = doc.xml_select(u"//db:link[@xl:href]")
                image = doc.xml_select(u"//db:imagedata[@fileref]")
                # Flag pages that inline executable scripts (.py/.pl).
                exe = 0
                for c in code:
                    (p, ext) = os.path.splitext(c.href)
                    if ext in valid_scripts:
                        exe = 1

                # Only articles with both a title and a menu name become
                # pages on the site.
                if title and menu:
                    found = 0
                    # './foo/index.en.xml' -> '/foo/index' -> '/foo/'
                    base = xfile.split('.')[1]
                    link = base.replace('index','')
                    # Path depth, used for menu nesting.
                    level = len(filter(None,re.split(r'(^/\w*/|\w*/)',link)))
                    # Collect linked files and images that exist on disk.
                    res = []
                    for r in resource:
                        rf = os.path.join(dirname,r.href)
                        if os.path.isfile(rf):
                            res.append(rf)
                    for i in image:
                        im = os.path.join(dirname,i.fileref)
                        if os.path.isfile(im):
                            res.append(im)
                    page = dict(title=unicode(doc.article.info.title),
                                menu=unicode(doc.article.info.titleabbrev),
                                output=os.path.join(dirname,
                                                    filename.replace('xml','html')),
                                exe=exe,
                                file=xfile,
                                res=res,
                                level=level)
                    # Merge into the existing entry or append a new one.
                    for l in sitemap:
                        if l['link'] == link:
                            found = 1
                            l.update(page)
                    if not found:
                        print "adding "+link+" to sitemap"
                        dd = dict(link=link)
                        dd.update(page)
                        sitemap.append(dd)
    # Persist the (possibly extended) link list for the next run.
    sfile = open('sitemap.txt','w')
    for l in sitemap:
        sfile.write(l['link']+'\n')
    sfile.close()
    return sitemap
250
def expandXincludeTxt(page):
    """Parse the page's docbook source and replace every
    <xi:include parse='text'> that points at an executable script
    (.py/.pl) with the XML that the script writes to stdout.
    Returns the (possibly modified) document."""
    doc = bindery.parse(page['file'],
                        prefixes={u'db': u'http://docbook.org/ns/docbook',
                                  u'xi': u'http://www.w3.org/2001/XInclude'})
    if page['exe']:
        code  = doc.xml_select(u"//xi:include[@parse='text']")
        for c in code:
            (p, ext) = os.path.splitext(c.href)
            if ext in valid_scripts:
                # Run the referenced script and parse its stdout as XML.
                exe = os.path.join(os.path.abspath(c.href))
                xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
                xstr = bindery.parse(str(xml.stdout.read()))
                # Splice the generated nodes in at the include's position,
                # then drop the include element itself.
                id = c.xml_index_on_parent
                for x in xstr.xml_children:
                    c.xml_parent.xml_insert(id,x)
                c.xml_parent.xml_remove(c)
    return doc
268
def xsltConvert(doc, page_info=None):
    """Run the docbook XSLT over *doc* and return the resulting HTML.

    amara can not handle the docbook stylesheets, so the document is
    serialised to a temp file and piped through the external xsltproc.

    page_info: dict with a 'file' key naming the article source path.
    Defaults to the module-global ``page`` (the main loop variable) for
    backward compatibility -- the old version read that global silently
    despite not declaring any such parameter.
    """
    if page_info is None:
        page_info = page  # module global set by the main processing loop
    cwd = os.getcwd()
    # Run from the article's directory so relative xincludes resolve.
    rundir = os.path.dirname(page_info['file'])
    os.chdir(rundir)
    infile  = os.path.basename(tempfile.mktemp())
    outfile = tempfile.mktemp()
    tfi = open(infile,'w')
    tfi.write(doc.xml_encode())
    tfi.close()
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
    cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
    retcode = subprocess.call(cmd)
    if retcode:
        print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
    tfo = open(outfile,'r')
    result = tfo.read()
    tfo.close()
    os.remove(infile)
    os.remove(outfile)
    os.chdir(cwd)
    return result
292
def genMenu(page,sitemap,slevel,elevel):
    """Render a nested <ul> menu from the sitemap.

    page   : current page dict, or None for the site-wide menu
    sitemap: ordered list of page dicts (depth-first site order)
    slevel : first level to include
    elevel : last level to include; slevel == elevel renders only the
             sibling group around *page*

    Returns (html, title): the menu markup and its heading (the parent
    entry's name for a sub-menu, else the site root's menu name)."""
    title = None
    sm = []
    if elevel == MAXLEVEL or elevel == 1 or page == None:
        # Full menu over the whole sitemap.
        html = '<ul>\n'
        sm = sitemap
    else:
        # Sub-menu: collect the contiguous run of entries that share
        # page's level, and title it after the preceding parent entry.
        html = '<ul class="tree">\n'
        idx = sitemap.index(page)
        while (sitemap[idx]['level'] == page['level']):
            idx = idx-1
        title = sitemap[idx]['menu']
        idx = idx+1
        while (idx < len(sitemap) and sitemap[idx]['level'] == page['level']):
            sm.append(sitemap[idx])
            idx = idx+1
    oldlevel = slevel

    for p in sm:
        if slevel > p['level'] or elevel < p['level']:
            continue
        if not title and p['link'] == '/':
            title = p['menu']

        # Open/close nested lists whenever the depth changes.
        if oldlevel < p['level']:
            html+='<ul>\n'
        elif oldlevel > p['level']:
            if p['link'][-1] == '/':
                html+='</li>\n'
            html+='</ul>\n</li>\n'
        if page != None and page == p:
            html+='<li class="selected"><a href="%s">%s</a>' % (p['link'],p['menu'])
        else:
            html+='<li><a href="%s">%s</a>' % (p['link'],p['menu'])
        # Directory links ('.../') stay open so children nest inside;
        # the root '/' is closed immediately.
        if p['link'][-1] != '/' or p['link'] == '/':
            html+='</li>\n'
        oldlevel = p['level']
    html+='</ul>\n'
    return (html,title)
332
def writeToTemplate(page,doc,sitemap):
    """Fill the Cheetah template with the rendered article and menus,
    write the result under the tmptarget staging directory, and copy
    the page's resource files alongside it."""
    (menu,menuname) = genMenu(page,sitemap,1,MAXLEVEL)
    # Second menu: only the sibling entries at this page's level.
    (levelmenu,levelname) = genMenu(page,sitemap,page['level'],page['level'])
    template = Template(file=style_tmpl,
                        searchList=[{'title':page['title']},
                                    {'menu':menu},
                                    {'article':doc},
                                    {'levelmenu':levelmenu},
                                    {'levelname':levelname}])
    outfile = tmptarget+page['output']
    mkdir_p(os.path.dirname(outfile))
    out = open(outfile, 'w')
    out.write(str(template))
    out.close()
    # Copy linked files and images next to the generated page.
    for r in page['res']:
        mkdir_p(os.path.dirname(tmptarget+r))
        shutil.copyfile(r, tmptarget+r)
350
def createSitemap(sitemap):
    """Write sitemap.en.html into tmptarget: the full site menu
    rendered as a standalone page."""
    (menu,menuname) = genMenu(None,sitemap,1,MAXLEVEL)
    template = Template(file=style_tmpl,
                        searchList=[
            {'title':'Sitemap'},
            {'menu':menu},
            # The menu markup doubles as the page body here.
            {'article':menu},
            {'levelmenu':''},
            {'levelname':''}])
    outfile = tmptarget+'sitemap.en.html'
    mkdir_p(os.path.dirname(outfile))
    out = open(outfile, 'w')
    out.write(str(template))
    out.close()
365
366 dir_ = Directory()
367 sitemap = Sitemap()
368
369 dir_.scan()
370 sitemap.read_map()
371
372 missing = dir_.set() - sitemap.set()
373 removed = sitemap.set() - dir_.set()
374 for page in removed:
375     print removed+' pages missing!!'
376
377 for page in missing:
378     print 'adding missing page '+page
379     sitemap.add_page(page)
380
381 sitemap.graph()
382
383
384 sitemap = generateSitemap()
385 tmptarget = tempfile.mkdtemp()+'/'
386 for page in sitemap:
387     t1 = time.time()
388     print "Page : %-30s %30s" % (page['link'],
389                         time.ctime(os.stat(page['file']).st_mtime)),
390     doc = expandXincludeTxt(page)
391     pubdoc = xsltConvert(doc)
392     writeToTemplate(page,pubdoc,sitemap)
393     t2 = time.time()
394     print "[%5.2f s]" % (round(t2-t1,2))
395
396 createSitemap(sitemap)
397 publish(tmptarget, args.output)
398 publish(args.style+"css", args.output)
399 publish(args.style+"images",args.output)