001    /*
002     * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
003     *
004     * Project and contact information: http://www.cascading.org/
005     *
006     * This file is part of the Cascading project.
007     *
008     * Licensed under the Apache License, Version 2.0 (the "License");
009     * you may not use this file except in compliance with the License.
010     * You may obtain a copy of the License at
011     *
012     *     http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing, software
015     * distributed under the License is distributed on an "AS IS" BASIS,
016     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017     * See the License for the specific language governing permissions and
018     * limitations under the License.
019     */
020    
021    package cascading.operation.regex;
022    
023    import java.beans.ConstructorProperties;
024    import java.util.regex.Pattern;
025    
026    import cascading.flow.FlowProcess;
027    import cascading.operation.Function;
028    import cascading.operation.FunctionCall;
029    import cascading.operation.OperationCall;
030    import cascading.tuple.Fields;
031    import cascading.tuple.Tuple;
032    import cascading.util.Pair;
033    
034    /**
035     * Class RegexGenerator will emit a new Tuple for every split on the incoming argument value delimited by the given patternString.
036     * <p/>
037     * RegexGenerator only expects one field value. If more than one argument value is passed, only the
038     * first is handled, the remainder are ignored.
039     * <p/>
040     * This could be used to break a document into single word tuples for later processing for a word count.
041     * <p/>
042     * Note a {@code null} valued argument passed to the parser will be converted to an empty string ({@code ""}) before
043     * the regex is applied.
044     * <p/>
045     * Any Object value will be coerced to a String type if type information is provided. See the
046     * {@link cascading.tuple.type.CoercibleType} interface to control how custom Object types are converted to String
047     * values.
048     */
049    public class RegexSplitGenerator extends RegexOperation<Pair<Pattern, Tuple>> implements Function<Pair<Pattern, Tuple>>
050      {
051      /**
052       * Constructor RegexGenerator creates a new RegexGenerator instance.
053       *
054       * @param patternString of type String
055       */
056      @ConstructorProperties({"patternString"})
057      public RegexSplitGenerator( String patternString )
058        {
059        super( 1, Fields.size( 1 ), patternString );
060        }
061    
062      /**
063       * Constructor RegexGenerator creates a new RegexGenerator instance.
064       *
065       * @param fieldDeclaration of type Fields
066       * @param patternString    of type String
067       */
068      @ConstructorProperties({"fieldDeclaration", "patternString"})
069      public RegexSplitGenerator( Fields fieldDeclaration, String patternString )
070        {
071        super( 1, fieldDeclaration, patternString );
072    
073        if( fieldDeclaration.size() != 1 )
074          throw new IllegalArgumentException( "fieldDeclaration may only declare one field, was " + fieldDeclaration.print() );
075        }
076    
077      @Override
078      public void prepare( FlowProcess flowProcess, OperationCall<Pair<Pattern, Tuple>> operationCall )
079        {
080        operationCall.setContext( new Pair<Pattern, Tuple>( getPattern(), Tuple.size( 1 ) ) );
081        }
082    
083      @Override
084      public void operate( FlowProcess flowProcess, FunctionCall<Pair<Pattern, Tuple>> functionCall )
085        {
086        String value = functionCall.getArguments().getString( 0 );
087    
088        if( value == null )
089          value = "";
090    
091        String[] split = functionCall.getContext().getLhs().split( value );
092    
093        for( String string : split )
094          {
095          functionCall.getContext().getRhs().set( 0, string );
096          functionCall.getOutputCollector().add( functionCall.getContext().getRhs() );
097          }
098        }
099      }