vendredi 8 mai 2015

Iterate through xml and html with BeautifulSoup

Currently I'm trying to use BeautifulSoup to iterate over a xml file, trying to extract these kind of lines:

<Cell ss:StyleID="s(26|27|39)"><Data ss:Type="String">Textcontent</Data></Cell>

Extracting them works mostly, but I don't know why I'm getting more than 250 results when I should be getting only 1.

My current script:

#!/usr/local/bin/python
# coding: utf-8

import sys

from HTMLParser import HTMLParser
from bs4 import BeautifulSoup as bs

class ParseExcelXMLFile(object):

    soup = None
    substituteTagDict = {
        "ul": "",
        "b": "*",
        "li": "\n * ",
    }
    _file = None
    _index = []

    def __init__(self, file):
        self._file = file
        self.soup = bs(open(file, 'r+').read())
        self.iterateXML(self.soup)

    def iterateXML(self, node):
        if hasattr(node, 'name') and node.name == u'data':
            self.parseHtml(node)
        if 'findChildren' in dir(node):
            for child in node.findChildren():
                self.iterateXML(child)

    def iterateHtml(self, node):
        processed_html = ""
        for child in node.findChildren():
            if hasattr(child, 'name'):
                processed_html += str(self.substituteHtmlTag(child))
            if hasattr(child, 'findChildren'):
                processed_html += self.iterateHtml(child)
        return processed_html

    def substituteHtmlTag(self, node):
        if node.name in self.substituteTagDict:
            node.name = self.substituteTagDict[node.name]
            # index the other attributes of the tag and save it in another file, while remark to index remains like [#431]
            # not sure how I can access them without knowing the identifier of them(name, id, ...), similar to:
            # if not node.attributes in self._index:
            #   self._index.append(node.attributes)
            # node.attributes = "[#%i]" % (self._index.index(node.attributes))
        return node.prettify()

    def parseHtml(self, node):
        parsed_html = ""
        html = HTMLParser.unescape.__func__(HTMLParser, node.text)
        if '<ul class="bodytextAttributes">' in html:
            soup = bs(html)
            for bodytext in soup.find_all('ul', {'class' : 'bodytextAttributes'}):
                parsed_html += self.iterateHtml(bodytext)
        return parsed_html

    def saveXML(self, file=_file):
        open(file, 'w+').write(str(self.soup.prettify()))       

if __name__ == '__main__':
    parser = ParseExcelXMLFile(sys.argv[1])

Since I failed already by iterating over the xml elements the html part is written freely without testing sadly. For the test case I've rewritten the structure of the xml file:

<?xml version="1.0"?>
<Workbook xmlns="urn:schemas-microsoft-com:office:spreadsheet"
          xmlns:o="urn:schemas-microsoft-com:office:office"
          xmlns:x="urn:schemas-microsoft-com:office:excel"
          xmlns:html="http://ift.tt/qQdaDR"
          xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet">
    <DocumentProperties xmlns="urn:schemas-microsoft-com:office:office">
        <Author>Test Case</Author>
    </DocumentProperties>
    <OfficeDocumentSettings xmlns="urn:schemas-microsoft-com:office:office">
        <AllowPNG/>
    </OfficeDocumentSettings>
    <Styles>
        <Style ss:ID="s27">
            <Alignment ss:Vertical="Top" ss:WrapText="1"/>
        </Style>
        <Style ss:ID="s35">
            <Font ss:FontName="Verdana" ss:Size="18.0" ss:Bold="1"/>
            <Interior ss:Color="#969696" ss:Pattern="Solid"/>
        </Style>
    </Styles>
    <Worksheet ss:Name="Translation">
        <Table ss:ExpandedColumnCount="6" ss:ExpandedRowCount="5" x:FullColumns="1" x:FullRows="1">
            <Column ss:AutoFitWidth="0" ss:Width="151.0"/>

            <!-- Page header -->
           <Row>
            <Cell ss:Index="2" ss:StyleID="s35"><Data ss:Type="String">TestString [1]</Data></Cell>
            <Cell ss:StyleID="s35"></Cell>
            <Cell ss:StyleID="s35"></Cell>
            <Cell ss:StyleID="s35"></Cell>

           </Row>
            <!-- Field list header -->
           <Row>
            <Cell ss:Index="2" ss:StyleID="s38"><Data ss:Type="String">Fieldname:</Data></Cell>
            <Cell ss:StyleID="s38"><Data ss:Type="String">Source language:</Data></Cell>
            <Cell ss:StyleID="s38"><Data ss:Type="String">Alternative source language:</Data></Cell>
            <Cell ss:StyleID="s38"><Data ss:Type="String">Translation:</Data></Cell>
            <Cell ss:StyleID="s38"><Data ss:Type="String">Difference since last tr.:</Data></Cell>
           </Row>
                            <!-- Element header -->
                           <Row>
                            <Cell ss:Index="2" ss:StyleID="s37"><Data ss:Type="String">Element: pages:1</Data></Cell>
                            <Cell ss:StyleID="s37"></Cell>
                            <Cell ss:StyleID="s37"></Cell>
                            <Cell ss:StyleID="s37"></Cell>

                           </Row>

                                <!-- Translation row: -->
                                   <Row ss:StyleID="s25">
                                    <Cell><Data ss:Type="String">translation[pages][1][pages_language_overlay:130:title]</Data></Cell>
                                    <Cell ss:StyleID="s26"><Data ss:Type="String">title</Data></Cell>
                                    <Cell ss:StyleID="s27"><Data ss:Type="String">&lt;ul class=&quot;bodytextAttributes&quot;&gt;    &lt;li class=&quot;field&quot;&gt;        &lt;b&gt;bold:&lt;/b&gt;        &lt;span class=&quot;exampleClass&quot;&gt;1234567890&lt;/span&gt;    &lt;/li&gt;    &lt;li class=&quot;field&quot;&gt;    &lt;b&gt;bold:&lt;/b&gt;        &lt;div class=&quot;exampleClass2&quot;&gt;        &lt;ul class=&quot;exampleClass3&quot;&gt;            &lt;li style=&quot;padding-left: 10px;&quot;&gt;Text1&lt;/li&gt;        &lt;/ul&gt;        &lt;ul class=&quot;exampleClass3&quot;&gt;            &lt;li style=&quot;padding-left: 10px;&quot;&gt;Text2&lt;/li&gt;        &lt;/ul&gt;        &lt;ul class=&quot;exampleClass3&quot;&gt;            &lt;li style=&quot;padding-left: 10px;&quot;&gt;Text3&lt;/li&gt;        &lt;/ul&gt;        &lt;ul class=&quot;exampleClass3&quot;&gt;            &lt;li style=&quot;padding-left: 10px;&quot;&gt;Text4&lt;/li&gt;        &lt;/ul&gt;        &lt;/div&gt;    &lt;/li&gt;&lt;/ul&gt;</Data></Cell>
                                    <Cell ss:StyleID="s27"><Data ss:Type="String"></Data></Cell>
                                    <Cell ss:StyleID="s39"><Data ss:Type="String">&lt;ul class=&quot;bodytextAttributes&quot;&gt;    &lt;li class=&quot;field&quot;&gt;        &lt;b&gt;bold:&lt;/b&gt;        &lt;span class=&quot;exampleClass&quot;&gt;1234567890&lt;/span&gt;    &lt;/li&gt;    &lt;li class=&quot;field&quot;&gt;    &lt;b&gt;bold:&lt;/b&gt;        &lt;div class=&quot;exampleClass2&quot;&gt;        &lt;ul class=&quot;exampleClass3&quot;&gt;            &lt;li style=&quot;padding-left: 10px;&quot;&gt;Text1&lt;/li&gt;        &lt;/ul&gt;        &lt;ul class=&quot;exampleClass3&quot;&gt;            &lt;li style=&quot;padding-left: 10px;&quot;&gt;Text2&lt;/li&gt;        &lt;/ul&gt;        &lt;ul class=&quot;exampleClass3&quot;&gt;            &lt;li style=&quot;padding-left: 10px;&quot;&gt;Text3&lt;/li&gt;        &lt;/ul&gt;        &lt;ul class=&quot;exampleClass3&quot;&gt;            &lt;li style=&quot;padding-left: 10px;&quot;&gt;Text4&lt;/li&gt;        &lt;/ul&gt;        &lt;/div&gt;    &lt;/li&gt;&lt;/ul&gt;</Data></Cell>
                                    <Cell ss:StyleID="s27"><Data ss:Type="String">[No change]</Data></Cell>
                                   </Row>    
                <!-- Spacer row -->
               <Row>
                <Cell ss:Index="2"><Data ss:Type="String"></Data></Cell>
               </Row>

        </Table>
        <WorksheetOptions xmlns="urn:schemas-microsoft-com:office:excel">
            <PageSetup>
                <Layout x:Orientation="Landscape"/>
            </PageSetup>
        </WorksheetOptions>
    </Worksheet>
    <Worksheet ss:Name="Information">
        <WorksheetOptions xmlns="urn:schemas-microsoft-com:office:excel">
            <PageSetup>
                <Layout x:Orientation="Landscape"/>
            </PageSetup>
        </WorksheetOptions>
    </Worksheet>
</Workbook>

any hints why I got so many results in parsing the excel xml with my script and how I could get the amount I really need? I feel like I'm getting the children twice and threefold, how can I prevent that? I should be able to index and substitute the tags, but iterating through the xml/html is really not working for me atm(with the current script/xml I get 256 results, which should be 1).

Aucun commentaire:

Enregistrer un commentaire