/*
 * Copyright (c) 2016-2017 Chris K Wensel. All Rights Reserved.
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.operation.regex;

import java.beans.ConstructorProperties;
import java.util.regex.Pattern;

import cascading.flow.FlowProcess;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.OperationCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import cascading.util.Pair;

/**
 * Class RegexSplitter will split an incoming argument value by the given regex delimiter patternString.
 * <p>
 * RegexSplitter only expects one field value. If more than one argument value is passed, only the
 * first is handled, the remainder are ignored.
 * <p>
 * Note a {@code null} valued argument passed to the parser will be converted to an empty string ({@code ""}) before
 * the regex is applied.
 * <p>
 * Any Object value will be coerced to a String type if type information is provided. See the
 * {@link cascading.tuple.type.CoercibleType} interface to control how custom Object types are converted to String
 * values.
 */
public class RegexSplitter extends RegexOperation<Pair<Pattern, TupleEntry>> implements Function<Pair<Pattern, TupleEntry>>
  {
  /**
   * Limit handed to {@link Pattern#split(CharSequence, int)}: {@code -1} when the declared fields are
   * unknown, otherwise the number of declared fields.
   */
  private int length;

  /**
   * Constructor RegexSplitter creates a new RegexSplitter instance.
   *
   * @param patternString of type String
   */
  @ConstructorProperties({"patternString"})
  public RegexSplitter( String patternString )
    {
    super( 1, patternString );
    length = splitLimit( fieldDeclaration );
    }

  /**
   * Constructor RegexSplitter creates a new RegexSplitter instance, where the delimiter is the tab character.
   *
   * @param fieldDeclaration of type Fields
   */
  @ConstructorProperties({"fieldDeclaration"})
  public RegexSplitter( Fields fieldDeclaration )
    {
    super( 1, fieldDeclaration, "\t" );
    // keep the split limit in step with the other constructors instead of leaving it at the default 0
    length = splitLimit( fieldDeclaration );
    }

  /**
   * Constructor RegexSplitter creates a new RegexSplitter instance.
   *
   * @param fieldDeclaration of type Fields
   * @param patternString    of type String
   */
  @ConstructorProperties({"fieldDeclaration", "patternString"})
  public RegexSplitter( Fields fieldDeclaration, String patternString )
    {
    super( 1, fieldDeclaration, patternString );
    length = splitLimit( fieldDeclaration );
    }

  /**
   * Returns the split limit for the given fields: {@code -1} if they are unknown, their size otherwise.
   *
   * @param fields of type Fields
   * @return the limit to pass to {@link Pattern#split(CharSequence, int)}
   */
  private static int splitLimit( Fields fields )
    {
    return fields.isUnknown() ? -1 : fields.size();
    }

  /**
   * Recomputes the split limit from the declared fields of the given call and stores, as the call context,
   * the pattern from {@code getPattern()} paired with a TupleEntry that {@link #operate} reuses for every
   * argument value.
   *
   * @param flowProcess   of type FlowProcess
   * @param operationCall of type OperationCall
   */
  @Override
  public void prepare( FlowProcess flowProcess, OperationCall<Pair<Pattern, TupleEntry>> operationCall )
    {
    Fields declaredFields = operationCall.getDeclaredFields();

    length = splitLimit( declaredFields );

    // the backing tuple always has at least one position, even when the declared fields are unknown
    TupleEntry tupleEntry = new TupleEntry( declaredFields, Tuple.size( Math.max( 1, length ) ) );

    operationCall.setContext( new Pair<>( getPattern(), tupleEntry ) );
    }

  /**
   * Splits the first argument value with the context pattern and adds the resulting values to the
   * output collector. A {@code null} argument value is treated as an empty string.
   *
   * @param flowProcess  of type FlowProcess
   * @param functionCall of type FunctionCall
   */
  @Override
  public void operate( FlowProcess flowProcess, FunctionCall<Pair<Pattern, TupleEntry>> functionCall )
    {
    String value = functionCall.getArguments().getString( 0 );

    if( value == null )
      value = "";

    Pair<Pattern, TupleEntry> context = functionCall.getContext();
    TupleEntry output = context.getRhs();

    String[] split = context.getLhs().split( value, length );

    if( length == -1 )
      {
      // unknown declared fields: the output tuple takes however many elements the split produced
      output.getTuple().clear();

      for( String element : split )
        output.getTuple().add( element );
      }
    else
      {
      output.setCanonicalValues( split );
      }

    functionCall.getOutputCollector().add( output );
    }
  }