Currently I'm trying to use BeautifulSoup to iterate over a xml file, trying to extract these kind of lines:
<Cell ss:StyleID="s(26|27|39)"><Data ss:Type="String">Textcontent</Data></Cell>
Extracting them works mostly, but I don't know why I'm getting more than 250 results when I should be getting only 1.
My current script:
#!/usr/local/bin/python
# coding: utf-8
import sys
from HTMLParser import HTMLParser
from bs4 import BeautifulSoup as bs
class ParseExcelXMLFile(object):
soup = None
substituteTagDict = {
"ul": "",
"b": "*",
"li": "\n * ",
}
_file = None
_index = []
def __init__(self, file):
self._file = file
self.soup = bs(open(file, 'r+').read())
self.iterateXML(self.soup)
def iterateXML(self, node):
if hasattr(node, 'name') and node.name == u'data':
self.parseHtml(node)
if 'findChildren' in dir(node):
for child in node.findChildren():
self.iterateXML(child)
def iterateHtml(self, node):
processed_html = ""
for child in node.findChildren():
if hasattr(child, 'name'):
processed_html += str(self.substituteHtmlTag(child))
if hasattr(child, 'findChildren'):
processed_html += self.iterateHtml(child)
return processed_html
def substituteHtmlTag(self, node):
if node.name in self.substituteTagDict:
node.name = self.substituteTagDict[node.name]
# index the other attributes of the tag and save it in another file, while remark to index remains like [#431]
# not sure how I can access them without knowing the identifier of them(name, id, ...), similar to:
# if not node.attributes in self._index:
# self._index.append(node.attributes)
# node.attributes = "[#%i]" % (self._index.index(node.attributes))
return node.prettify()
def parseHtml(self, node):
parsed_html = ""
html = HTMLParser.unescape.__func__(HTMLParser, node.text)
if '<ul class="bodytextAttributes">' in html:
soup = bs(html)
for bodytext in soup.find_all('ul', {'class' : 'bodytextAttributes'}):
parsed_html += self.iterateHtml(bodytext)
return parsed_html
def saveXML(self, file=_file):
open(file, 'w+').write(str(self.soup.prettify()))
if __name__ == '__main__':
parser = ParseExcelXMLFile(sys.argv[1])
Since I failed already by iterating over the xml elements the html part is written freely without testing sadly. For the test case I've rewritten the structure of the xml file:
<?xml version="1.0"?>
<Workbook xmlns="urn:schemas-microsoft-com:office:spreadsheet"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:x="urn:schemas-microsoft-com:office:excel"
xmlns:html="http://ift.tt/qQdaDR"
xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet">
<DocumentProperties xmlns="urn:schemas-microsoft-com:office:office">
<Author>Test Case</Author>
</DocumentProperties>
<OfficeDocumentSettings xmlns="urn:schemas-microsoft-com:office:office">
<AllowPNG/>
</OfficeDocumentSettings>
<Styles>
<Style ss:ID="s27">
<Alignment ss:Vertical="Top" ss:WrapText="1"/>
</Style>
<Style ss:ID="s35">
<Font ss:FontName="Verdana" ss:Size="18.0" ss:Bold="1"/>
<Interior ss:Color="#969696" ss:Pattern="Solid"/>
</Style>
</Styles>
<Worksheet ss:Name="Translation">
<Table ss:ExpandedColumnCount="6" ss:ExpandedRowCount="5" x:FullColumns="1" x:FullRows="1">
<Column ss:AutoFitWidth="0" ss:Width="151.0"/>
<!-- Page header -->
<Row>
<Cell ss:Index="2" ss:StyleID="s35"><Data ss:Type="String">TestString [1]</Data></Cell>
<Cell ss:StyleID="s35"></Cell>
<Cell ss:StyleID="s35"></Cell>
<Cell ss:StyleID="s35"></Cell>
</Row>
<!-- Field list header -->
<Row>
<Cell ss:Index="2" ss:StyleID="s38"><Data ss:Type="String">Fieldname:</Data></Cell>
<Cell ss:StyleID="s38"><Data ss:Type="String">Source language:</Data></Cell>
<Cell ss:StyleID="s38"><Data ss:Type="String">Alternative source language:</Data></Cell>
<Cell ss:StyleID="s38"><Data ss:Type="String">Translation:</Data></Cell>
<Cell ss:StyleID="s38"><Data ss:Type="String">Difference since last tr.:</Data></Cell>
</Row>
<!-- Element header -->
<Row>
<Cell ss:Index="2" ss:StyleID="s37"><Data ss:Type="String">Element: pages:1</Data></Cell>
<Cell ss:StyleID="s37"></Cell>
<Cell ss:StyleID="s37"></Cell>
<Cell ss:StyleID="s37"></Cell>
</Row>
<!-- Translation row: -->
<Row ss:StyleID="s25">
<Cell><Data ss:Type="String">translation[pages][1][pages_language_overlay:130:title]</Data></Cell>
<Cell ss:StyleID="s26"><Data ss:Type="String">title</Data></Cell>
<Cell ss:StyleID="s27"><Data ss:Type="String"><ul class="bodytextAttributes"> <li class="field"> <b>bold:</b> <span class="exampleClass">1234567890</span> </li> <li class="field"> <b>bold:</b> <div class="exampleClass2"> <ul class="exampleClass3"> <li style="padding-left: 10px;">Text1</li> </ul> <ul class="exampleClass3"> <li style="padding-left: 10px;">Text2</li> </ul> <ul class="exampleClass3"> <li style="padding-left: 10px;">Text3</li> </ul> <ul class="exampleClass3"> <li style="padding-left: 10px;">Text4</li> </ul> </div> </li></ul></Data></Cell>
<Cell ss:StyleID="s27"><Data ss:Type="String"></Data></Cell>
<Cell ss:StyleID="s39"><Data ss:Type="String"><ul class="bodytextAttributes"> <li class="field"> <b>bold:</b> <span class="exampleClass">1234567890</span> </li> <li class="field"> <b>bold:</b> <div class="exampleClass2"> <ul class="exampleClass3"> <li style="padding-left: 10px;">Text1</li> </ul> <ul class="exampleClass3"> <li style="padding-left: 10px;">Text2</li> </ul> <ul class="exampleClass3"> <li style="padding-left: 10px;">Text3</li> </ul> <ul class="exampleClass3"> <li style="padding-left: 10px;">Text4</li> </ul> </div> </li></ul></Data></Cell>
<Cell ss:StyleID="s27"><Data ss:Type="String">[No change]</Data></Cell>
</Row>
<!-- Spacer row -->
<Row>
<Cell ss:Index="2"><Data ss:Type="String"></Data></Cell>
</Row>
</Table>
<WorksheetOptions xmlns="urn:schemas-microsoft-com:office:excel">
<PageSetup>
<Layout x:Orientation="Landscape"/>
</PageSetup>
</WorksheetOptions>
</Worksheet>
<Worksheet ss:Name="Information">
<WorksheetOptions xmlns="urn:schemas-microsoft-com:office:excel">
<PageSetup>
<Layout x:Orientation="Landscape"/>
</PageSetup>
</WorksheetOptions>
</Worksheet>
</Workbook>
any hints why I got so many results in parsing the excel xml with my script and how I could get the amount I really need? I feel like I'm getting the children twice and threefold, how can I prevent that? I should be able to index and substitute the tags, but iterating through the xml/html is really not working for me atm(with the current script/xml I get 256 results, which should be 1).
Aucun commentaire:
Enregistrer un commentaire