/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.operation.regex;

import java.beans.ConstructorProperties;
import java.util.regex.Pattern;

import cascading.flow.FlowProcess;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.OperationCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.util.Pair;

/**
 * Class RegexSplitGenerator splits the incoming argument value on the given patternString and emits one
 * new Tuple per resulting piece.
 * <p/>
 * Only a single field value is expected. When more than one argument value is passed, the first one is
 * split and the remaining values are ignored.
 * <p/>
 * A typical use is breaking a document into single word tuples that are later counted.
 * <p/>
 * A {@code null} valued argument is replaced by an empty string ({@code ""}) before the regex is applied.
 * The split itself is performed by {@link Pattern#split(CharSequence)}, so trailing empty strings are not
 * emitted.
 * <p/>
 * Any Object value will be coerced to a String type if type information is provided. See the
 * {@link cascading.tuple.type.CoercibleType} interface to control how custom Object types are converted to String
 * values.
 */
public class RegexSplitGenerator extends RegexOperation<Pair<Pattern, Tuple>> implements Function<Pair<Pattern, Tuple>>
  {
  /**
   * Constructor RegexSplitGenerator creates a new RegexSplitGenerator instance that declares a single
   * result field via {@code Fields.size( 1 )}.
   *
   * @param patternString of type String
   */
  @ConstructorProperties({"patternString"})
  public RegexSplitGenerator( String patternString )
    {
    super( 1, Fields.size( 1 ), patternString );
    }

  /**
   * Constructor RegexSplitGenerator creates a new RegexSplitGenerator instance using the given
   * field declaration for its single result field.
   *
   * @param fieldDeclaration of type Fields, must declare exactly one field
   * @param patternString    of type String
   * @throws IllegalArgumentException if fieldDeclaration does not have a size of one
   */
  @ConstructorProperties({"fieldDeclaration", "patternString"})
  public RegexSplitGenerator( Fields fieldDeclaration, String patternString )
    {
    super( 1, fieldDeclaration, patternString );

    // every emitted Tuple carries exactly one value, so the declaration must match
    boolean declaresSingleField = fieldDeclaration.size() == 1;

    if( !declaresSingleField )
      throw new IllegalArgumentException( "fieldDeclaration may only declare one field, was " + fieldDeclaration.print() );
    }

  /**
   * Stores the pattern together with a single-element Tuple as the operation context, so that
   * {@link #operate(FlowProcess, FunctionCall)} can reuse both on every call.
   *
   * @param flowProcess   of type FlowProcess
   * @param operationCall of type OperationCall, receives the new context
   */
  @Override
  public void prepare( FlowProcess flowProcess, OperationCall<Pair<Pattern, Tuple>> operationCall )
    {
    Pattern pattern = getPattern();
    Tuple reusableResult = Tuple.size( 1 );

    operationCall.setContext( new Pair<Pattern, Tuple>( pattern, reusableResult ) );
    }

  /**
   * Splits the first argument value with the context pattern and adds one result Tuple to the output
   * collector for each piece, in the order the pieces were produced.
   *
   * @param flowProcess  of type FlowProcess
   * @param functionCall of type FunctionCall, supplies the arguments, the context and the output collector
   */
  @Override
  public void operate( FlowProcess flowProcess, FunctionCall<Pair<Pattern, Tuple>> functionCall )
    {
    String argument = functionCall.getArguments().getString( 0 );
    String input = argument == null ? "" : argument; // a null argument is split as an empty string

    Pair<Pattern, Tuple> context = functionCall.getContext();
    Pattern pattern = context.getLhs();
    Tuple result = context.getRhs();

    String[] pieces = pattern.split( input );

    for( int index = 0; index < pieces.length; index++ )
      {
      // the same Tuple instance is overwritten and handed to the collector for every piece
      result.set( 0, pieces[ index ] );
      functionCall.getOutputCollector().add( result );
      }
    }
  }