001/* 002 * Copyright (c) 2016-2017 Chris K Wensel. All Rights Reserved. 003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved. 004 * 005 * Project and contact information: http://www.cascading.org/ 006 * 007 * This file is part of the Cascading project. 008 * 009 * Licensed under the Apache License, Version 2.0 (the "License"); 010 * you may not use this file except in compliance with the License. 011 * You may obtain a copy of the License at 012 * 013 * http://www.apache.org/licenses/LICENSE-2.0 014 * 015 * Unless required by applicable law or agreed to in writing, software 016 * distributed under the License is distributed on an "AS IS" BASIS, 017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 018 * See the License for the specific language governing permissions and 019 * limitations under the License. 020 */ 021 022package cascading.operation.regex; 023 024import java.beans.ConstructorProperties; 025import java.util.regex.Pattern; 026 027import cascading.flow.FlowProcess; 028import cascading.operation.Function; 029import cascading.operation.FunctionCall; 030import cascading.operation.OperationCall; 031import cascading.tuple.Fields; 032import cascading.tuple.Tuple; 033import cascading.tuple.TupleEntry; 034import cascading.util.Pair; 035 036/** 037 * Class RegexGenerator will emit a new Tuple for every split on the incoming argument value delimited by the given patternString. 038 * <p> 039 * RegexGenerator only expects one field value. If more than one argument value is passed, only the 040 * first is handled, the remainder are ignored. 041 * <p> 042 * This could be used to break a document into single word tuples for later processing for a word count. 043 * <p> 044 * Note a {@code null} valued argument passed to the parser will be converted to an empty string ({@code ""}) before 045 * the regex is applied. 046 * <p> 047 * Any Object value will be coerced to a String type if type information is provided. See the 048 * {@link cascading.tuple.type.CoercibleType} interface to control how custom Object types are converted to String 049 * values. 050 */ 051public class RegexSplitGenerator extends RegexOperation<Pair<Pattern, TupleEntry>> implements Function<Pair<Pattern, TupleEntry>> 052 { 053 /** 054 * Constructor RegexGenerator creates a new RegexGenerator instance. 055 * 056 * @param patternString of type String 057 */ 058 @ConstructorProperties({"patternString"}) 059 public RegexSplitGenerator( String patternString ) 060 { 061 super( 1, Fields.size( 1 ), patternString ); 062 } 063 064 /** 065 * Constructor RegexGenerator creates a new RegexGenerator instance. 066 * 067 * @param fieldDeclaration of type Fields 068 * @param patternString of type String 069 */ 070 @ConstructorProperties({"fieldDeclaration", "patternString"}) 071 public RegexSplitGenerator( Fields fieldDeclaration, String patternString ) 072 { 073 super( 1, fieldDeclaration, patternString ); 074 075 if( fieldDeclaration.size() != 1 ) 076 throw new IllegalArgumentException( "fieldDeclaration may only declare one field, was " + fieldDeclaration.print() ); 077 } 078 079 @Override 080 public void prepare( FlowProcess flowProcess, OperationCall<Pair<Pattern, TupleEntry>> operationCall ) 081 { 082 TupleEntry tupleEntry = new TupleEntry( operationCall.getDeclaredFields(), Tuple.size( 1 ) ); 083 084 operationCall.setContext( new Pair<>( getPattern(), tupleEntry ) ); 085 } 086 087 @Override 088 public void operate( FlowProcess flowProcess, FunctionCall<Pair<Pattern, TupleEntry>> functionCall ) 089 { 090 String value = functionCall.getArguments().getString( 0 ); 091 092 if( value == null ) 093 value = ""; 094 095 String[] split = functionCall.getContext().getLhs().split( value ); 096 097 for( String string : split ) 098 { 099 TupleEntry tupleEntry = functionCall.getContext().getRhs(); 100 101 tupleEntry.setString( 0, string ); 102 functionCall.getOutputCollector().add( tupleEntry ); 103 } 104 } 105 }