/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.operation.xml;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;

import cascading.flow.FlowProcess;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.OperationException;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.util.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

/**
 * XPathParser evaluates each configured XPath expression against the XML held in the first argument
 * value and emits a single result Tuple holding one value per expression, in expression order. In effect,
 * an XML document is flattened into a row of a table.
 * <p>
 * Every expression is evaluated as a node set. When the node set is non-empty only its first Node is used,
 * and that Node is passed through {@code writeAsXML} before being added to the result. When nothing matches,
 * an empty String is added instead. If only text values are wanted, select the text() nodes; to handle
 * every Node of a result, consider {@link XPathGenerator}.
 */
public class XPathParser extends XPathOperation implements Function<Pair<DocumentBuilder, Tuple>>
  {
  /** Field LOG */
  private static final Logger LOG = LoggerFactory.getLogger( XPathParser.class );

  /**
   * Constructor XPathParser creates a new XPathParser instance.
   *
   * @param fieldDeclaration of type Fields, the fields to declare, one per xpath expression unless a substitution
   * @param namespaces       of type String[][], handed to the parent operation unchanged
   * @param paths            of type String..., the xpath expressions to evaluate
   * @throws IllegalArgumentException if the declared fields are not a substitution and differ in size from paths
   */
  public XPathParser( Fields fieldDeclaration, String[][] namespaces, String... paths )
    {
    super( 1, fieldDeclaration, namespaces, paths );

    verifyDeclaredSize( fieldDeclaration, paths );
    }

  /**
   * Constructor XPathParser creates a new XPathParser instance without any namespaces.
   *
   * @param fieldDeclaration of type Fields, the fields to declare, one per xpath expression unless a substitution
   * @param paths            of type String..., the xpath expressions to evaluate
   * @throws IllegalArgumentException if the declared fields are not a substitution and differ in size from paths
   */
  public XPathParser( Fields fieldDeclaration, String... paths )
    {
    super( 1, fieldDeclaration, null, paths );

    verifyDeclaredSize( fieldDeclaration, paths );
    }

  /**
   * Rejects a field declaration that is neither a substitution nor the same size as the given expressions.
   *
   * @param fieldDeclaration of type Fields
   * @param paths            of type String[]
   */
  private static void verifyDeclaredSize( Fields fieldDeclaration, String[] paths )
    {
    // substitutions are accepted as-is, only concrete declarations are size checked
    if( fieldDeclaration.isSubstitution() || fieldDeclaration.size() == paths.length )
      {
      return;
      }

    throw new IllegalArgumentException( "declared fields and given xpath expressions are not the same size: " + fieldDeclaration.print() + " paths: " + paths.length );
    }

  /**
   * Parses the first argument value into a Document, evaluates every expression against it and
   * emits the collected values as one Tuple.
   *
   * @param flowProcess  of type FlowProcess
   * @param functionCall of type FunctionCall, its context supplies the DocumentBuilder and the reusable Tuple
   */
  @Override
  public void operate( FlowProcess flowProcess, FunctionCall<Pair<DocumentBuilder, Tuple>> functionCall )
    {
    // the result Tuple lives in the context and is reused, so reset it first
    Tuple result = functionCall.getContext().getRhs();

    result.clear();

    String xml = functionCall.getArguments().getString( 0 );
    Document document = parseDocument( functionCall.getContext().getLhs(), xml );

    int index = 0;

    while( index < getExpressions().size() )
      {
      appendFirstMatch( result, document, index );
      index++;
      }

    functionCall.getOutputCollector().add( result );
    }

  /**
   * Evaluates the expression at the given position and appends either the first matching Node,
   * passed through {@code writeAsXML}, or an empty String to the result.
   *
   * @param result   of type Tuple, receives exactly one new value
   * @param document of type Document, the document to evaluate against
   * @param index    of type int, position of both the expression and its originating path
   * @throws OperationException if the expression cannot be evaluated
   */
  private void appendFirstMatch( Tuple result, Document document, int index )
    {
    try
      {
      NodeList nodes = (NodeList) getExpressions().get( index ).evaluate( document, XPathConstants.NODESET );
      boolean matched = nodes != null && nodes.getLength() != 0;

      if( LOG.isDebugEnabled() )
        {
        LOG.debug( "xpath: {} was: {}", paths[ index ], matched );
        }

      if( !matched )
        {
        result.add( "" );
        }
      else
        {
        result.add( writeAsXML( nodes.item( 0 ) ) );
        }
      }
    catch( XPathExpressionException exception )
      {
      throw new OperationException( "could not evaluate xpath expression: " + paths[ index ], exception );
      }
    }
  }