001    /*
002     * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
003     *
004     * Project and contact information: http://www.cascading.org/
005     *
006     * This file is part of the Cascading project.
007     *
008     * Licensed under the Apache License, Version 2.0 (the "License");
009     * you may not use this file except in compliance with the License.
010     * You may obtain a copy of the License at
011     *
012     *     http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing, software
015     * distributed under the License is distributed on an "AS IS" BASIS,
016     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017     * See the License for the specific language governing permissions and
018     * limitations under the License.
019     */
020    
021    package cascading.operation.xml;
022    
023    import java.beans.ConstructorProperties;
024    import java.io.IOException;
025    import java.io.StringReader;
026    import java.io.StringWriter;
027    import java.util.HashMap;
028    import java.util.Map;
029    
030    import cascading.flow.FlowProcess;
031    import cascading.operation.BaseOperation;
032    import cascading.operation.Function;
033    import cascading.operation.FunctionCall;
034    import cascading.tuple.Fields;
035    import cascading.tuple.Tuple;
036    import org.ccil.cowan.tagsoup.HTMLSchema;
037    import org.ccil.cowan.tagsoup.Parser;
038    import org.ccil.cowan.tagsoup.XMLWriter;
039    import org.slf4j.Logger;
040    import org.slf4j.LoggerFactory;
041    import org.xml.sax.InputSource;
042    import org.xml.sax.SAXException;
043    import org.xml.sax.SAXNotRecognizedException;
044    import org.xml.sax.SAXNotSupportedException;
045    
046    /**
047     * Class TagSoupParser uses the <a href="http://home.ccil.org/~cowan/XML/tagsoup/">Tag Soup</a> library to convert
048     * incoming HTML to clean XHTML.
049     */
050    public class TagSoupParser extends BaseOperation implements Function
051      {
052      /** Field LOG */
053      private static final Logger LOG = LoggerFactory.getLogger( TagSoupParser.class );
054    
055      /** Field features */
056      private Map<String, Boolean> features;
057      /** Field schema */
058      private transient HTMLSchema schema;
059      /** Field parser */
060      private transient Parser parser;
061    
062      /**
063       * Constructor TagSoupParser creates a new TagSoupParser instance.
064       *
065       * @param fieldDeclaration of type Fields
066       */
067      @ConstructorProperties({"fieldDeclaration"})
068      public TagSoupParser( Fields fieldDeclaration )
069        {
070        super( 1, fieldDeclaration );
071    
072        if( fieldDeclaration.size() != 1 )
073          throw new IllegalArgumentException( "fieldDeclaration may only declare one field name: " + fieldDeclaration.print() );
074        }
075    
076      private HTMLSchema getSchema()
077        {
078        if( schema == null )
079          schema = new HTMLSchema();
080    
081        return schema;
082        }
083    
084      private Parser getParser() throws SAXNotSupportedException, SAXNotRecognizedException
085        {
086        if( parser != null )
087          return parser;
088    
089        parser = new Parser();
090        parser.setProperty( Parser.schemaProperty, getSchema() );
091    
092        if( features != null )
093          {
094          for( Map.Entry<String, Boolean> entry : features.entrySet() )
095            parser.setFeature( entry.getKey(), entry.getValue() );
096          }
097    
098        return parser;
099        }
100    
101      /**
102       * Method setFeature allows the user to set 'features' directly on the TagSoup parser, {@link Parser#setFeature}.
103       * <p/>
104       * Note, all features are lazily added when the Parser is instantiated.
105       *
106       * @param feature of type String
107       * @param value   of type boolean
108       */
109      public void setFeature( String feature, boolean value )
110        {
111        if( features == null )
112          features = new HashMap<String, Boolean>();
113    
114        features.put( feature, value );
115        }
116    
117      /** @see cascading.operation.Function#operate(cascading.flow.FlowProcess, cascading.operation.FunctionCall) */
118      public void operate( FlowProcess flowProcess, FunctionCall functionCall )
119        {
120        try
121          {
122          StringWriter writer = new StringWriter();
123          XMLWriter xmlWriter = new XMLWriter( writer );
124    
125          xmlWriter.setPrefix( getSchema().getURI(), "" );
126          xmlWriter.setOutputProperty( XMLWriter.OMIT_XML_DECLARATION, "yes" );
127    
128          InputSource source = new InputSource( new StringReader( (String) functionCall.getArguments().getObject( 0 ) ) );
129    
130          getParser().setContentHandler( xmlWriter );
131    
132          getParser().parse( source );
133    
134          functionCall.getOutputCollector().add( new Tuple( writer.getBuffer().toString() ) );
135          }
136        catch( SAXNotRecognizedException exception )
137          {
138          LOG.warn( "ignoring TagSoup exception", exception );
139          }
140        catch( SAXNotSupportedException exception )
141          {
142          LOG.warn( "ignoring TagSoup exception", exception );
143          }
144        catch( IOException exception )
145          {
146          LOG.warn( "ignoring TagSoup exception", exception );
147          }
148        catch( SAXException exception )
149          {
150          LOG.warn( "ignoring TagSoup exception", exception );
151          }
152        }
153    
154      @Override
155      public boolean equals( Object object )
156        {
157        if( this == object )
158          return true;
159        if( !( object instanceof TagSoupParser ) )
160          return false;
161        if( !super.equals( object ) )
162          return false;
163    
164        TagSoupParser that = (TagSoupParser) object;
165    
166        if( features != null ? !features.equals( that.features ) : that.features != null )
167          return false;
168    
169        return true;
170        }
171    
172      @Override
173      public int hashCode()
174        {
175        int result = super.hashCode();
176        result = 31 * result + ( features != null ? features.hashCode() : 0 );
177        return result;
178        }
179      }