001    /*
002     * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
003     *
004     * Project and contact information: http://www.cascading.org/
005     *
006     * This file is part of the Cascading project.
007     *
008     * Licensed under the Apache License, Version 2.0 (the "License");
009     * you may not use this file except in compliance with the License.
010     * You may obtain a copy of the License at
011     *
012     *     http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing, software
015     * distributed under the License is distributed on an "AS IS" BASIS,
016     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017     * See the License for the specific language governing permissions and
018     * limitations under the License.
019     */
020    
021    package cascading.operation.regex;
022    
023    import java.beans.ConstructorProperties;
024    import java.util.regex.Pattern;
025    
026    import cascading.flow.FlowProcess;
027    import cascading.operation.Function;
028    import cascading.operation.FunctionCall;
029    import cascading.operation.OperationCall;
030    import cascading.tuple.Fields;
031    import cascading.tuple.Tuple;
032    import cascading.util.Pair;
033    
034    /**
035     * Class RegexSplitter will split an incoming argument value by the given regex delimiter patternString.
036     * <p/>
037     * RegexSplitter only expects one field value. If more than one argument value is passed, only the
038     * first is handled, the remainder are ignored.
039     * <p/>
040     * Note a {@code null} valued argument passed to the parser will be converted to an empty string ({@code ""}) before
041     * the regex is applied.
042     * <p/>
043     * Any Object value will be coerced to a String type if type information is provided. See the
044     * {@link cascading.tuple.type.CoercibleType} interface to control how custom Object types are converted to String
045     * values.
046     */
047    public class RegexSplitter extends RegexOperation<Pair<Pattern, Tuple>> implements Function<Pair<Pattern, Tuple>>
048      {
049      private final int length;
050    
051      /**
052       * Constructor RegexSplitter creates a new RegexSplitter instance.
053       *
054       * @param patternString of type String
055       */
056      @ConstructorProperties({"patternString"})
057      public RegexSplitter( String patternString )
058        {
059        super( 1, patternString );
060        length = fieldDeclaration.isUnknown() ? -1 : fieldDeclaration.size();
061        }
062    
063      /**
064       * Constructor RegexOperation creates a new RegexOperation instance, where the delimiter is the tab character.
065       *
066       * @param fieldDeclaration of type Fields
067       */
068      @ConstructorProperties({"fieldDeclaration"})
069      public RegexSplitter( Fields fieldDeclaration )
070        {
071        super( 1, fieldDeclaration, "\t" );
072        length = fieldDeclaration.isUnknown() ? -1 : fieldDeclaration.size();
073        }
074    
075      /**
076       * Constructor RegexSplitter creates a new RegexSplitter instance.
077       *
078       * @param fieldDeclaration of type Fields
079       * @param patternString    of type String
080       */
081      @ConstructorProperties({"fieldDeclaration", "patternString"})
082      public RegexSplitter( Fields fieldDeclaration, String patternString )
083        {
084        super( 1, fieldDeclaration, patternString );
085        length = fieldDeclaration.isUnknown() ? -1 : fieldDeclaration.size();
086        }
087    
088      @Override
089      public void prepare( FlowProcess flowProcess, OperationCall<Pair<Pattern, Tuple>> operationCall )
090        {
091        operationCall.setContext( new Pair<Pattern, Tuple>( getPattern(), new Tuple() ) );
092        }
093    
094      @Override
095      public void operate( FlowProcess flowProcess, FunctionCall<Pair<Pattern, Tuple>> functionCall )
096        {
097        String value = functionCall.getArguments().getString( 0 );
098    
099        if( value == null )
100          value = "";
101    
102        Tuple output = functionCall.getContext().getRhs();
103    
104        output.clear();
105    
106        String[] split = functionCall.getContext().getLhs().split( value, length );
107    
108        for( int i = 0; i < split.length; i++ )
109          output.add( split[ i ] );
110    
111        functionCall.getOutputCollector().add( output );
112        }
113      }