001/*
002 * Copyright (c) 2016-2017 Chris K Wensel. All Rights Reserved.
003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
004 *
005 * Project and contact information: http://www.cascading.org/
006 *
007 * This file is part of the Cascading project.
008 *
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *     http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 */
021
022package cascading.operation.regex;
023
024import java.beans.ConstructorProperties;
025import java.util.regex.Pattern;
026
027import cascading.flow.FlowProcess;
028import cascading.operation.Function;
029import cascading.operation.FunctionCall;
030import cascading.operation.OperationCall;
031import cascading.tuple.Fields;
032import cascading.tuple.Tuple;
033import cascading.tuple.TupleEntry;
034import cascading.util.Pair;
035
036/**
037 * Class RegexSplitter will split an incoming argument value by the given regex delimiter patternString.
038 * <p>
039 * RegexSplitter only expects one field value. If more than one argument value is passed, only the
040 * first is handled, the remainder are ignored.
041 * <p>
042 * Note a {@code null} valued argument passed to the parser will be converted to an empty string ({@code ""}) before
043 * the regex is applied.
044 * <p>
045 * Any Object value will be coerced to a String type if type information is provided. See the
046 * {@link cascading.tuple.type.CoercibleType} interface to control how custom Object types are converted to String
047 * values.
048 */
049public class RegexSplitter extends RegexOperation<Pair<Pattern, TupleEntry>> implements Function<Pair<Pattern, TupleEntry>>
050  {
051  private int length;
052
053  /**
054   * Constructor RegexSplitter creates a new RegexSplitter instance.
055   *
056   * @param patternString of type String
057   */
058  @ConstructorProperties({"patternString"})
059  public RegexSplitter( String patternString )
060    {
061    super( 1, patternString );
062    length = fieldDeclaration.isUnknown() ? -1 : fieldDeclaration.size();
063    }
064
065  /**
066   * Constructor RegexOperation creates a new RegexOperation instance, where the delimiter is the tab character.
067   *
068   * @param fieldDeclaration of type Fields
069   */
070  @ConstructorProperties({"fieldDeclaration"})
071  public RegexSplitter( Fields fieldDeclaration )
072    {
073    super( 1, fieldDeclaration, "\t" );
074    }
075
076  /**
077   * Constructor RegexSplitter creates a new RegexSplitter instance.
078   *
079   * @param fieldDeclaration of type Fields
080   * @param patternString    of type String
081   */
082  @ConstructorProperties({"fieldDeclaration", "patternString"})
083  public RegexSplitter( Fields fieldDeclaration, String patternString )
084    {
085    super( 1, fieldDeclaration, patternString );
086    length = fieldDeclaration.isUnknown() ? -1 : fieldDeclaration.size();
087    }
088
089  @Override
090  public void prepare( FlowProcess flowProcess, OperationCall<Pair<Pattern, TupleEntry>> operationCall )
091    {
092    length = operationCall.getDeclaredFields().isUnknown() ? -1 : operationCall.getDeclaredFields().size();
093
094    TupleEntry tupleEntry = new TupleEntry( operationCall.getDeclaredFields(), Tuple.size( Math.max( 1, length ) ) );
095
096    operationCall.setContext( new Pair<>( getPattern(), tupleEntry ) );
097    }
098
099  @Override
100  public void operate( FlowProcess flowProcess, FunctionCall<Pair<Pattern, TupleEntry>> functionCall )
101    {
102    String value = functionCall.getArguments().getString( 0 );
103
104    if( value == null )
105      value = "";
106
107    TupleEntry output = functionCall.getContext().getRhs();
108
109    String[] split = functionCall.getContext().getLhs().split( value, length );
110
111    if( length == -1 )
112      {
113      output.getTuple().clear();
114
115      for( String element : split )
116        output.getTuple().add( element );
117      }
118    else
119      {
120      output.setCanonicalValues( split );
121      }
122
123    functionCall.getOutputCollector().add( output );
124    }
125  }