/*
 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.operation.regex;

import java.beans.ConstructorProperties;
import java.util.Arrays;
import java.util.regex.Matcher;

import cascading.CascadingException;
import cascading.flow.FlowProcess;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.OperationCall;
import cascading.operation.OperationException;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import cascading.util.Pair;

/**
 * Class RegexParser is used to extract a matched regex from an incoming argument value.
 * <p>
 * RegexParser only expects one field value. If more than one argument value is passed, only the
 * first is handled, the remainder are ignored.
 * <p>
 * Sometimes it's useful to parse out a value from a key/value pair in a string, if the key exists. If the key does
 * not exist, returning an empty string instead of failing is typically expected.
 * <p>
 * The following regex can extract a value from {@code key1=value1&key2=value2} if key1 exists, otherwise an
 * empty string is returned:<br>
 * {@code (?<=key1=)[^&]*|$}
 * <p>
 * Note a {@code null} valued argument passed to the parser will be converted to an empty string ({@code ""}) before
 * the regex is applied.
 * <p>
 * Any Object value will be coerced to a String type if type information is provided. See the
 * {@link cascading.tuple.type.CoercibleType} interface to control how custom Object types are converted to String
 * values.
 * <p>
 * Any type information on the declaredFields will also be honored by coercing the parsed String value to the
 * canonical declared type. This is useful when creating or using CoercibleType classes, like
 * {@link cascading.tuple.type.DateType}.
 */
public class RegexParser extends RegexOperation<Pair<Matcher, TupleEntry>> implements Function<Pair<Matcher, TupleEntry>>
  {
  /** Regex group indexes to emit, in output order; {@code null} when every found group should be emitted. */
  private int[] groups = null;

  /**
   * Constructor RegexParser creates a new RegexParser instance, where the argument Tuple value is matched and returned
   * in a new Tuple.
   * <p>
   * If the given patternString declares regular expression groups, each group will be returned as a value in the
   * resulting Tuple. If no groups are declared, the match will be returned as the only value in the resulting Tuple.
   * <p>
   * The fields returned will be {@link Fields#UNKNOWN}, so a variable number of values may be emitted based on the
   * regular expression given.
   *
   * @param patternString of type String
   */
  @ConstructorProperties({"patternString"})
  public RegexParser( String patternString )
    {
    super( 1, patternString );
    }

  /**
   * Constructor RegexParser creates a new RegexParser instance, where the argument Tuple value is matched and returned
   * as the given Field.
   * <p>
   * If the given patternString declares regular expression groups, each group will be returned as a value in the
   * resulting Tuple. If no groups are declared, the match will be returned as the only value in the resulting Tuple.
   * <p>
   * If more groups are matched than fields were declared in the fieldDeclaration, setting the surplus value is
   * expected to fail during runtime, and that failure is reported as a {@link CascadingException}.
   * <p>
   * To overcome this, either use the constructors that take an array of groups, or use the {@code (?: ...)} sequence
   * to tell the regular expression matcher to not capture the group.
   *
   * @param fieldDeclaration of type Fields
   * @param patternString    of type String
   */
  @ConstructorProperties({"fieldDeclaration", "patternString"})
  public RegexParser( Fields fieldDeclaration, String patternString )
    {
    super( 1, fieldDeclaration, patternString );
    }

  /**
   * Constructor RegexParser creates a new RegexParser instance, where the patternString is a regular expression
   * with match groups and whose groups designated by {@code groups} are stored in the appropriate number of new fields.
   * <p>
   * The number of resulting fields will match the number of groups given ({@code groups.length}).
   *
   * @param patternString of type String
   * @param groups        of type int[]
   * @throws IllegalArgumentException if {@code groups} is {@code null} or empty
   */
  @ConstructorProperties({"patternString", "groups"})
  public RegexParser( String patternString, int... groups )
    {
    super( 1, Fields.size( verifyReturnLength( groups ) ), patternString );

    // defensive copy so later changes to the caller's array do not affect this operation
    this.groups = Arrays.copyOf( groups, groups.length );
    }

  /**
   * Verifies the given groups array is usable and returns its length.
   *
   * @param groups of type int[]
   * @return the number of groups given
   * @throws IllegalArgumentException if {@code groups} is {@code null} or empty
   */
  private static int verifyReturnLength( int[] groups )
    {
    if( groups == null || groups.length == 0 )
      throw new IllegalArgumentException( "groups may not be null or 0 length" );

    return groups.length;
    }

  /**
   * Constructor RegexParser creates a new RegexParser instance, where the patternString is a regular expression
   * with match groups and whose groups designated by {@code groups} are stored in the named fieldDeclarations.
   *
   * @param fieldDeclaration of type Fields
   * @param patternString    of type String
   * @param groups           of type int[]
   * @throws IllegalArgumentException if {@code groups} is {@code null} or empty, or if the fieldDeclaration is not
   *                                  unknown and its size differs from the number of groups
   */
  @ConstructorProperties({"fieldDeclaration", "patternString", "groups"})
  public RegexParser( Fields fieldDeclaration, String patternString, int... groups )
    {
    super( 1, fieldDeclaration, patternString );

    verifyReturnLength( groups );

    // defensive copy so later changes to the caller's array do not affect this operation
    this.groups = Arrays.copyOf( groups, groups.length );

    if( !fieldDeclaration.isUnknown() && fieldDeclaration.size() != groups.length )
      throw new IllegalArgumentException( "fieldDeclaration must equal number of groups to be captured, fields: " + fieldDeclaration.print() );
    }

  /**
   * Method getGroups returns a copy of the group indexes given at construction.
   *
   * @return a copy of the groups array, or {@code null} if no groups were given
   */
  public int[] getGroups()
    {
    if( groups == null )
      return null;

    return Arrays.copyOf( groups, groups.length );
    }

  /**
   * Creates the per-call context: a reusable {@link Matcher} paired with a reusable output {@link TupleEntry}
   * sized to the number of values expected to be emitted.
   *
   * @param flowProcess   of type FlowProcess
   * @param operationCall of type OperationCall
   */
  @Override
  public void prepare( FlowProcess flowProcess, OperationCall<Pair<Matcher, TupleEntry>> operationCall )
    {
    int size;

    if( groups != null )
      size = groups.length;
    else
      size = operationCall.getDeclaredFields().size(); // if Fields.UNKNOWN size will be zero

    // TupleEntry allows us to honor the declared field type information
    TupleEntry entry = new TupleEntry( operationCall.getDeclaredFields(), Tuple.size( size ) );

    operationCall.setContext( new Pair<>( getPattern().matcher( "" ), entry ) );
    }

  /**
   * Applies the pattern to the first argument value and emits the matched groups.
   * <p>
   * A {@code null} argument value is treated as an empty string.
   *
   * @param flowProcess  of type FlowProcess
   * @param functionCall of type FunctionCall
   * @throws OperationException if the pattern cannot be found in the argument value
   */
  @Override
  public void operate( FlowProcess flowProcess, FunctionCall<Pair<Matcher, TupleEntry>> functionCall )
    {
    String value = functionCall.getArguments().getString( 0 );

    if( value == null )
      value = "";

    Matcher matcher = functionCall.getContext().getLhs().reset( value );

    if( !matcher.find() )
      throw new OperationException( "could not match pattern: [" + getPatternString() + "] with value: [" + value + "]" );

    TupleEntry output = functionCall.getContext().getRhs();

    if( groups != null )
      onGivenGroups( functionCall, matcher, output );
    else
      onFoundGroups( functionCall, matcher, output );
    }

  /**
   * Emits every group found by the matcher, or the whole match if the pattern declares no groups.
   */
  private void onFoundGroups( FunctionCall<Pair<Matcher, TupleEntry>> functionCall, Matcher matcher, TupleEntry output )
    {
    int count = matcher.groupCount();

    // if UNKNOWN then the returned number fields will be of variable size
    // subsequently we must clear the tuple, and add the found values
    if( functionCall.getDeclaredFields().isUnknown() )
      addGroupsToTuple( matcher, output, count );
    else
      setGroupsOnTuple( matcher, output, count );

    // this overcomes an issue in the planner resolver where if REPLACE is declared, the declared
    // fields for the current operation are expected to match the argument fields
    functionCall.getOutputCollector().add( output.getTuple() );
    }

  /**
   * Stores the matched groups positionally on the output entry, so declared field type information is honored.
   * <p>
   * When the pattern declares no groups ({@code count == 0}) the whole match (group 0) is stored at position 0.
   */
  private void setGroupsOnTuple( Matcher matcher, TupleEntry output, int count )
    {
    if( count == 0 )
      {
      setGroupOnTuple( matcher, output, 0, 0 );
      return;
      }

    for( int i = 0; i < count; i++ )
      setGroupOnTuple( matcher, output, i, i + 1 ); // skip group 0
    }

  /**
   * Stores a single regex group at the given output position, reporting any failure with field and group context.
   *
   * @throws CascadingException wrapping whatever prevented the value from being set
   */
  private static void setGroupOnTuple( Matcher matcher, TupleEntry output, int pos, int group )
    {
    try
      {
      output.setString( pos, matcher.group( group ) );
      }
    catch( Exception exception )
      {
      throw new CascadingException( "unable to set tuple value at field: " + describeField( output, pos ) + ", from regex group: " + group, exception );
      }
    }

  /**
   * Returns a printable name for the field at the given position without failing when the position is outside
   * the declared fields, so that building an error message can never mask the original failure.
   */
  private static String describeField( TupleEntry output, int pos )
    {
    Fields fields = output.getFields();

    if( pos < fields.size() )
      return String.valueOf( fields.get( pos ) );

    return "[position " + pos + " beyond " + fields.size() + " declared fields]";
    }

  /**
   * Clears the output tuple and appends the matched groups, or the whole match if the pattern declares no groups.
   * Used when the declared fields are unknown and the number of emitted values may vary.
   */
  private void addGroupsToTuple( Matcher matcher, TupleEntry output, int count )
    {
    Tuple tuple = output.getTuple();

    tuple.clear();

    if( count == 0 )
      {
      tuple.add( matcher.group( 0 ) );
      }
    else
      {
      for( int i = 0; i < count; i++ )
        tuple.add( matcher.group( i + 1 ) ); // skip group 0
      }
    }

  /**
   * Emits only the groups requested at construction, in the order they were given.
   */
  private void onGivenGroups( FunctionCall<Pair<Matcher, TupleEntry>> functionCall, Matcher matcher, TupleEntry output )
    {
    for( int i = 0; i < groups.length; i++ )
      output.setString( i, matcher.group( groups[ i ] ) );

    functionCall.getOutputCollector().add( output );
    }

  @Override
  public boolean equals( Object object )
    {
    if( this == object )
      return true;
    if( !( object instanceof RegexParser ) )
      return false;
    if( !super.equals( object ) )
      return false;

    RegexParser that = (RegexParser) object;

    return Arrays.equals( groups, that.groups );
    }

  @Override
  public int hashCode()
    {
    // Arrays.hashCode returns 0 for a null array, matching the previous explicit null handling
    return 31 * super.hashCode() + Arrays.hashCode( groups );
    }
  }