001/* 002 * Copyright (c) 2007-2016 Concurrent, Inc. All Rights Reserved. 003 * 004 * Project and contact information: http://www.cascading.org/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021package cascading.operation.xml; 022 023import java.beans.ConstructorProperties; 024import java.io.IOException; 025import java.io.StringReader; 026import java.io.StringWriter; 027import java.util.HashMap; 028import java.util.Map; 029 030import cascading.flow.FlowProcess; 031import cascading.operation.BaseOperation; 032import cascading.operation.Function; 033import cascading.operation.FunctionCall; 034import cascading.tuple.Fields; 035import cascading.tuple.Tuple; 036import org.ccil.cowan.tagsoup.HTMLSchema; 037import org.ccil.cowan.tagsoup.Parser; 038import org.ccil.cowan.tagsoup.XMLWriter; 039import org.slf4j.Logger; 040import org.slf4j.LoggerFactory; 041import org.xml.sax.InputSource; 042import org.xml.sax.SAXException; 043import org.xml.sax.SAXNotRecognizedException; 044import org.xml.sax.SAXNotSupportedException; 045 046/** 047 * Class TagSoupParser uses the <a href="http://home.ccil.org/~cowan/XML/tagsoup/">Tag Soup</a> library to convert 048 * incoming HTML to clean XHTML. 049 */ 050public class TagSoupParser extends BaseOperation implements Function 051 { 052 /** Field LOG */ 053 private static final Logger LOG = LoggerFactory.getLogger( TagSoupParser.class ); 054 055 /** Field features */ 056 private Map<String, Boolean> features; 057 /** Field schema */ 058 private transient HTMLSchema schema; 059 /** Field parser */ 060 private transient Parser parser; 061 062 /** 063 * Constructor TagSoupParser creates a new TagSoupParser instance. 064 * 065 * @param fieldDeclaration of type Fields 066 */ 067 @ConstructorProperties({"fieldDeclaration"}) 068 public TagSoupParser( Fields fieldDeclaration ) 069 { 070 super( 1, fieldDeclaration ); 071 072 if( fieldDeclaration.size() != 1 ) 073 throw new IllegalArgumentException( "fieldDeclaration may only declare one field name: " + fieldDeclaration.print() ); 074 } 075 076 private HTMLSchema getSchema() 077 { 078 if( schema == null ) 079 schema = new HTMLSchema(); 080 081 return schema; 082 } 083 084 private Parser getParser() throws SAXNotSupportedException, SAXNotRecognizedException 085 { 086 if( parser != null ) 087 return parser; 088 089 parser = new Parser(); 090 parser.setProperty( Parser.schemaProperty, getSchema() ); 091 092 if( features != null ) 093 { 094 for( Map.Entry<String, Boolean> entry : features.entrySet() ) 095 parser.setFeature( entry.getKey(), entry.getValue() ); 096 } 097 098 return parser; 099 } 100 101 /** 102 * Method setFeature allows the user to set 'features' directly on the TagSoup parser, {@link Parser#setFeature}. 103 * <p/> 104 * Note, all features are lazily added when the Parser is instantiated. 105 * 106 * @param feature of type String 107 * @param value of type boolean 108 */ 109 public void setFeature( String feature, boolean value ) 110 { 111 if( features == null ) 112 features = new HashMap<String, Boolean>(); 113 114 features.put( feature, value ); 115 } 116 117 /** @see cascading.operation.Function#operate(cascading.flow.FlowProcess, cascading.operation.FunctionCall) */ 118 public void operate( FlowProcess flowProcess, FunctionCall functionCall ) 119 { 120 try 121 { 122 StringWriter writer = new StringWriter(); 123 XMLWriter xmlWriter = new XMLWriter( writer ); 124 125 xmlWriter.setPrefix( getSchema().getURI(), "" ); 126 xmlWriter.setOutputProperty( XMLWriter.OMIT_XML_DECLARATION, "yes" ); 127 128 InputSource source = new InputSource( new StringReader( (String) functionCall.getArguments().getObject( 0 ) ) ); 129 130 getParser().setContentHandler( xmlWriter ); 131 132 getParser().parse( source ); 133 134 functionCall.getOutputCollector().add( new Tuple( writer.getBuffer().toString() ) ); 135 } 136 catch( SAXNotRecognizedException exception ) 137 { 138 LOG.warn( "ignoring TagSoup exception", exception ); 139 } 140 catch( SAXNotSupportedException exception ) 141 { 142 LOG.warn( "ignoring TagSoup exception", exception ); 143 } 144 catch( IOException exception ) 145 { 146 LOG.warn( "ignoring TagSoup exception", exception ); 147 } 148 catch( SAXException exception ) 149 { 150 LOG.warn( "ignoring TagSoup exception", exception ); 151 } 152 } 153 154 @Override 155 public boolean equals( Object object ) 156 { 157 if( this == object ) 158 return true; 159 if( !( object instanceof TagSoupParser ) ) 160 return false; 161 if( !super.equals( object ) ) 162 return false; 163 164 TagSoupParser that = (TagSoupParser) object; 165 166 if( features != null ? !features.equals( that.features ) : that.features != null ) 167 return false; 168 169 return true; 170 } 171 172 @Override 173 public int hashCode() 174 { 175 int result = super.hashCode(); 176 result = 31 * result + ( features != null ? features.hashCode() : 0 ); 177 return result; 178 } 179 }