001    /*
002     * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
003     *
004     * Project and contact information: http://www.cascading.org/
005     *
006     * This file is part of the Cascading project.
007     *
008     * Licensed under the Apache License, Version 2.0 (the "License");
009     * you may not use this file except in compliance with the License.
010     * You may obtain a copy of the License at
011     *
012     *     http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing, software
015     * distributed under the License is distributed on an "AS IS" BASIS,
016     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017     * See the License for the specific language governing permissions and
018     * limitations under the License.
019     */
020    
021    package cascading.operation.xml;
022    
023    import javax.xml.parsers.DocumentBuilder;
024    import javax.xml.xpath.XPathConstants;
025    import javax.xml.xpath.XPathExpressionException;
026    
027    import cascading.flow.FlowProcess;
028    import cascading.operation.Function;
029    import cascading.operation.FunctionCall;
030    import cascading.operation.OperationException;
031    import cascading.tuple.Fields;
032    import cascading.tuple.Tuple;
033    import cascading.util.Pair;
034    import org.slf4j.Logger;
035    import org.slf4j.LoggerFactory;
036    import org.w3c.dom.Document;
037    import org.w3c.dom.NodeList;
038    
039    /**
040     * XPathParser will extract a value from the passed Tuple argument into a new Tuple field. One field
041     * for every given XPath expression will be created. This function effectively converts an XML document into
042     * a table.
043     * <p/>
044     * If the returned value of the expression is a NodeList, only the first Node is used. The Node is converted to a new
045     * XML document and converted to a String. If only the text values are required, search on the text() nodes, or consider
046     * using {@link XPathGenerator} to handle multiple NodeList values.
047     */
048    public class XPathParser extends XPathOperation implements Function<Pair<DocumentBuilder, Tuple>>
049      {
050      /** Field LOG */
051      private static final Logger LOG = LoggerFactory.getLogger( XPathParser.class );
052    
053      /**
054       * Constructor XPathParser creates a new XPathParser instance.
055       *
056       * @param fieldDeclaration of type Fields
057       * @param namespaces       of type String[][]
058       * @param paths            of type String...
059       */
060      public XPathParser( Fields fieldDeclaration, String[][] namespaces, String... paths )
061        {
062        super( 1, fieldDeclaration, namespaces, paths );
063    
064        if( !fieldDeclaration.isSubstitution() && fieldDeclaration.size() != paths.length )
065          throw new IllegalArgumentException( "declared fields and given xpath expressions are not the same size: " + fieldDeclaration.print() + " paths: " + paths.length );
066        }
067    
068      /**
069       * Constructor XPathParser creates a new XPathParser instance.
070       *
071       * @param fieldDeclaration of type Fields
072       * @param paths            of type String...
073       */
074      public XPathParser( Fields fieldDeclaration, String... paths )
075        {
076        super( 1, fieldDeclaration, null, paths );
077    
078        if( !fieldDeclaration.isSubstitution() && fieldDeclaration.size() != paths.length )
079          throw new IllegalArgumentException( "declared fields and given xpath expressions are not the same size: " + fieldDeclaration.print() + " paths: " + paths.length );
080        }
081    
082      @Override
083      public void operate( FlowProcess flowProcess, FunctionCall<Pair<DocumentBuilder, Tuple>> functionCall )
084        {
085        Tuple tuple = functionCall.getContext().getRhs();
086    
087        tuple.clear();
088    
089        String argument = functionCall.getArguments().getString( 0 );
090        Document document = parseDocument( functionCall.getContext().getLhs(), argument );
091    
092        for( int i = 0; i < getExpressions().size(); i++ )
093          {
094          try
095            {
096            NodeList value = (NodeList) getExpressions().get( i ).evaluate( document, XPathConstants.NODESET );
097    
098            if( LOG.isDebugEnabled() )
099              LOG.debug( "xpath: {} was: {}", paths[ i ], value != null && value.getLength() != 0 );
100    
101            if( value != null && value.getLength() != 0 )
102              tuple.add( writeAsXML( value.item( 0 ) ) );
103            else
104              tuple.add( "" );
105            }
106          catch( XPathExpressionException exception )
107            {
108            throw new OperationException( "could not evaluate xpath expression: " + paths[ i ], exception );
109            }
110          }
111    
112        functionCall.getOutputCollector().add( tuple );
113        }
114      }