001 /* 002 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved. 003 * 004 * Project and contact information: http://www.cascading.org/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021 package cascading.operation.regex; 022 023 import java.beans.ConstructorProperties; 024 import java.util.Arrays; 025 import java.util.regex.Matcher; 026 027 import cascading.flow.FlowProcess; 028 import cascading.operation.Function; 029 import cascading.operation.FunctionCall; 030 import cascading.operation.OperationCall; 031 import cascading.operation.OperationException; 032 import cascading.tuple.Fields; 033 import cascading.tuple.Tuple; 034 import cascading.util.Pair; 035 036 /** 037 * Class RegexParser is used to extract a matched regex from an incoming argument value. 038 * <p/> 039 * RegexParser only expects one field value. If more than one argument value is passed, only the 040 * first is handled, the remainder are ignored. 041 * <p/> 042 * Sometimes its useful to parse out a value from a key/value pair in a string, if the key exists. If the key does 043 * not exist, returning an empty string instead of failing is typically expected. 044 * <p/> 045 * The following regex can extract a value from {@code key1=value1&key2=value2} if key1 exists, otherwise an 046 * empty string is returned:<br/> 047 * <pre>(?<=key1=)[^&]*|$</pre> 048 * <p/> 049 * Note a {@code null} valued argument passed to the parser will be converted to an empty string ({@code ""}) before 050 * the regex is applied. 051 * <p/> 052 * Any Object value will be coerced to a String type if type information is provided. See the 053 * {@link cascading.tuple.type.CoercibleType} interface to control how custom Object types are converted to String 054 * values. 055 */ 056 public class RegexParser extends RegexOperation<Pair<Matcher, Tuple>> implements Function<Pair<Matcher, Tuple>> 057 { 058 /** Field groups */ 059 private int[] groups = null; 060 061 /** 062 * Constructor RegexParser creates a new RegexParser instance, where the argument Tuple value is matched and returned 063 * in a new Tuple. 064 * <p/> 065 * If the given patternString declares regular expression groups, each group will be returned as a value in the 066 * resulting Tuple. If no groups are declared, the match will be returned as the only value in the resulting Tuple. 067 * <p/> 068 * The fields returned will be {@link Fields#UNKNOWN}, so a variable number of values may be emitted based on the 069 * regular expression given. 070 * 071 * @param patternString of type String 072 */ 073 @ConstructorProperties({"patternString"}) 074 public RegexParser( String patternString ) 075 { 076 super( 1, patternString ); 077 } 078 079 /** 080 * Constructor RegexParser creates a new RegexParser instance, where the argument Tuple value is matched and returned 081 * as the given Field. 082 * <p/> 083 * If the given patternString declares regular expression groups, each group will be returned as a value in the 084 * resulting Tuple. If no groups are declared, the match will be returned as the only value in the resulting Tuple. 085 * <p/> 086 * If the number of fields in the fieldDeclaration does not match the number of groups matched, an {@link OperationException} 087 * will be thrown during runtime. 088 * <p/> 089 * To overcome this, either use the constructors that take an array of groups, or use the {@code (?: ...)} sequence 090 * to tell the regular expression matcher to not capture the group. 091 * 092 * @param fieldDeclaration of type Fields 093 * @param patternString of type String 094 */ 095 @ConstructorProperties({"fieldDeclaration", "patternString"}) 096 public RegexParser( Fields fieldDeclaration, String patternString ) 097 { 098 super( 1, fieldDeclaration, patternString ); 099 } 100 101 /** 102 * Constructor RegexParser creates a new RegexParser instance, where the patternString is a regular expression 103 * with match groups and whose groups designated by {@code groups} are stored in the appropriate number of new fields. 104 * <p/> 105 * The number of resulting fields will match the number of groups given ({@code groups.length}). 106 * 107 * @param patternString of type String 108 * @param groups of type int[] 109 */ 110 @ConstructorProperties({"patternString", "groups"}) 111 public RegexParser( String patternString, int[] groups ) 112 { 113 super( 1, Fields.size( verifyReturnLength( groups ) ), patternString ); 114 115 this.groups = Arrays.copyOf( groups, groups.length ); 116 } 117 118 private static int verifyReturnLength( int[] groups ) 119 { 120 if( groups == null || groups.length == 0 ) 121 throw new IllegalArgumentException( "groups may not be null or 0 length" ); 122 123 return groups.length; 124 } 125 126 /** 127 * Constructor RegexParser creates a new RegexParser instance, where the patternString is a regular expression 128 * with match groups and whose groups designated by {@code groups} are stored in the named fieldDeclarations. 129 * 130 * @param fieldDeclaration of type Fields 131 * @param patternString of type String 132 * @param groups of type int[] 133 */ 134 @ConstructorProperties({"fieldDeclaration", "patternString", "groups"}) 135 public RegexParser( Fields fieldDeclaration, String patternString, int[] groups ) 136 { 137 super( 1, fieldDeclaration, patternString ); 138 139 verifyReturnLength( groups ); 140 141 this.groups = Arrays.copyOf( groups, groups.length ); 142 143 if( !fieldDeclaration.isUnknown() && fieldDeclaration.size() != groups.length ) 144 throw new IllegalArgumentException( "fieldDeclaration must equal number of groups to be captured, fields: " + fieldDeclaration.print() ); 145 } 146 147 public int[] getGroups() 148 { 149 if( groups == null ) 150 return null; 151 152 return Arrays.copyOf( groups, groups.length ); 153 } 154 155 @Override 156 public void prepare( FlowProcess flowProcess, OperationCall<Pair<Matcher, Tuple>> operationCall ) 157 { 158 operationCall.setContext( new Pair<Matcher, Tuple>( getPattern().matcher( "" ), new Tuple() ) ); 159 } 160 161 @Override 162 public void operate( FlowProcess flowProcess, FunctionCall<Pair<Matcher, Tuple>> functionCall ) 163 { 164 String value = functionCall.getArguments().getString( 0 ); 165 166 if( value == null ) 167 value = ""; 168 169 Matcher matcher = functionCall.getContext().getLhs().reset( value ); 170 171 if( !matcher.find() ) 172 throw new OperationException( "could not match pattern: [" + getPatternString() + "] with value: [" + value + "]" ); 173 174 Tuple output = functionCall.getContext().getRhs(); 175 176 output.clear(); 177 178 if( groups != null ) 179 onGivenGroups( functionCall, matcher, output ); 180 else 181 onFoundGroups( functionCall, matcher, output ); 182 } 183 184 private final void onFoundGroups( FunctionCall<Pair<Matcher, Tuple>> functionCall, Matcher matcher, Tuple output ) 185 { 186 int count = matcher.groupCount(); 187 188 if( count == 0 ) 189 { 190 output.add( matcher.group( 0 ) ); 191 } 192 else 193 { 194 for( int i = 0; i < count; i++ ) 195 output.add( matcher.group( i + 1 ) ); // skip group 0 196 } 197 198 functionCall.getOutputCollector().add( output ); 199 } 200 201 private final void onGivenGroups( FunctionCall<Pair<Matcher, Tuple>> functionCall, Matcher matcher, Tuple output ) 202 { 203 for( int pos : groups ) 204 output.add( matcher.group( pos ) ); 205 206 functionCall.getOutputCollector().add( output ); 207 } 208 209 @Override 210 public boolean equals( Object object ) 211 { 212 if( this == object ) 213 return true; 214 if( !( object instanceof RegexParser ) ) 215 return false; 216 if( !super.equals( object ) ) 217 return false; 218 219 RegexParser that = (RegexParser) object; 220 221 if( !Arrays.equals( groups, that.groups ) ) 222 return false; 223 224 return true; 225 } 226 227 @Override 228 public int hashCode() 229 { 230 int result = super.hashCode(); 231 result = 31 * result + ( groups != null ? Arrays.hashCode( groups ) : 0 ); 232 return result; 233 } 234 }