001/*
002 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
004 *
005 * Project and contact information: http://www.cascading.org/
006 *
007 * This file is part of the Cascading project.
008 *
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *     http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 */
021
022package cascading.operation.regex;
023
024import java.beans.ConstructorProperties;
025import java.util.Arrays;
026import java.util.regex.Matcher;
027
028import cascading.CascadingException;
029import cascading.flow.FlowProcess;
030import cascading.operation.Function;
031import cascading.operation.FunctionCall;
032import cascading.operation.OperationCall;
033import cascading.operation.OperationException;
034import cascading.tuple.Fields;
035import cascading.tuple.Tuple;
036import cascading.tuple.TupleEntry;
037import cascading.util.Pair;
038
039/**
040 * Class RegexParser is used to extract a matched regex from an incoming argument value.
041 * <p>
042 * RegexParser only expects one field value. If more than one argument value is passed, only the
043 * first is handled, the remainder are ignored.
044 * <p>
045 * Sometimes its useful to parse out a value from a key/value pair in a string, if the key exists. If the key does
046 * not exist, returning an empty string instead of failing is typically expected.
047 * <p>
048 * The following regex can extract a value from {@code key1=value1&key2=value2} if key1 exists, otherwise an
049 * empty string is returned:<br>
050 * {@code (?<=key1=)[^&]*|$}
051 * <p>
052 * Note a {@code null} valued argument passed to the parser will be converted to an empty string ({@code ""}) before
053 * the regex is applied.
054 * <p>
055 * Any Object value will be coerced to a String type if type information is provided. See the
056 * {@link cascading.tuple.type.CoercibleType} interface to control how custom Object types are converted to String
057 * values.
058 * <p>
059 * Also, any type information on the declaredFields will also be honored by coercing the parsed String value to the
060 * canonical declared type. This is useful when creating or using CoercibleType classes, like
061 * {@link cascading.tuple.type.DateType}.
062 */
063public class RegexParser extends RegexOperation<Pair<Matcher, TupleEntry>> implements Function<Pair<Matcher, TupleEntry>>
064  {
065  /** Field groups */
066  private int[] groups = null;
067
068  /**
069   * Constructor RegexParser creates a new RegexParser instance, where the argument Tuple value is matched and returned
070   * in a new Tuple.
071   * <p>
072   * If the given patternString declares regular expression groups, each group will be returned as a value in the
073   * resulting Tuple. If no groups are declared, the match will be returned as the only value in the resulting Tuple.
074   * <p>
075   * The fields returned will be {@link Fields#UNKNOWN}, so a variable number of values may be emitted based on the
076   * regular expression given.
077   *
078   * @param patternString of type String
079   */
080  @ConstructorProperties({"patternString"})
081  public RegexParser( String patternString )
082    {
083    super( 1, patternString );
084    }
085
086  /**
087   * Constructor RegexParser creates a new RegexParser instance, where the argument Tuple value is matched and returned
088   * as the given Field.
089   * <p>
090   * If the given patternString declares regular expression groups, each group will be returned as a value in the
091   * resulting Tuple. If no groups are declared, the match will be returned as the only value in the resulting Tuple.
092   * <p>
093   * If the number of fields in the fieldDeclaration does not match the number of groups matched, an {@link OperationException}
094   * will be thrown during runtime.
095   * <p>
096   * To overcome this, either use the constructors that take an array of groups, or use the {@code (?: ...)} sequence
097   * to tell the regular expression matcher to not capture the group.
098   *
099   * @param fieldDeclaration of type Fields
100   * @param patternString    of type String
101   */
102  @ConstructorProperties({"fieldDeclaration", "patternString"})
103  public RegexParser( Fields fieldDeclaration, String patternString )
104    {
105    super( 1, fieldDeclaration, patternString );
106    }
107
108  /**
109   * Constructor RegexParser creates a new RegexParser instance, where the patternString is a regular expression
110   * with match groups and whose groups designated by {@code groups} are stored in the appropriate number of new fields.
111   * <p>
112   * The number of resulting fields will match the number of groups given ({@code groups.length}).
113   *
114   * @param patternString of type String
115   * @param groups        of type int[]
116   */
117  @ConstructorProperties({"patternString", "groups"})
118  public RegexParser( String patternString, int... groups )
119    {
120    super( 1, Fields.size( verifyReturnLength( groups ) ), patternString );
121
122    this.groups = Arrays.copyOf( groups, groups.length );
123    }
124
125  private static int verifyReturnLength( int[] groups )
126    {
127    if( groups == null || groups.length == 0 )
128      throw new IllegalArgumentException( "groups may not be null or 0 length" );
129
130    return groups.length;
131    }
132
133  /**
134   * Constructor RegexParser creates a new RegexParser instance, where the patternString is a regular expression
135   * with match groups and whose groups designated by {@code groups} are stored in the named fieldDeclarations.
136   *
137   * @param fieldDeclaration of type Fields
138   * @param patternString    of type String
139   * @param groups           of type int[]
140   */
141  @ConstructorProperties({"fieldDeclaration", "patternString", "groups"})
142  public RegexParser( Fields fieldDeclaration, String patternString, int... groups )
143    {
144    super( 1, fieldDeclaration, patternString );
145
146    verifyReturnLength( groups );
147
148    this.groups = Arrays.copyOf( groups, groups.length );
149
150    if( !fieldDeclaration.isUnknown() && fieldDeclaration.size() != groups.length )
151      throw new IllegalArgumentException( "fieldDeclaration must equal number of groups to be captured, fields: " + fieldDeclaration.print() );
152    }
153
154  public int[] getGroups()
155    {
156    if( groups == null )
157      return null;
158
159    return Arrays.copyOf( groups, groups.length );
160    }
161
162  @Override
163  public void prepare( FlowProcess flowProcess, OperationCall<Pair<Matcher, TupleEntry>> operationCall )
164    {
165    int size;
166
167    if( groups != null )
168      size = groups.length;
169    else
170      size = operationCall.getDeclaredFields().size(); // if Fields.UNKNOWN size will be zero
171
172    // TupleEntry allows us to honor the declared field type information
173    TupleEntry entry = new TupleEntry( operationCall.getDeclaredFields(), Tuple.size( size ) );
174
175    operationCall.setContext( new Pair<>( getPattern().matcher( "" ), entry ) );
176    }
177
178  @Override
179  public void operate( FlowProcess flowProcess, FunctionCall<Pair<Matcher, TupleEntry>> functionCall )
180    {
181    String value = functionCall.getArguments().getString( 0 );
182
183    if( value == null )
184      value = "";
185
186    Matcher matcher = functionCall.getContext().getLhs().reset( value );
187
188    if( !matcher.find() )
189      throw new OperationException( "could not match pattern: [" + getPatternString() + "] with value: [" + value + "]" );
190
191    TupleEntry output = functionCall.getContext().getRhs();
192
193    if( groups != null )
194      onGivenGroups( functionCall, matcher, output );
195    else
196      onFoundGroups( functionCall, matcher, output );
197    }
198
199  private void onFoundGroups( FunctionCall<Pair<Matcher, TupleEntry>> functionCall, Matcher matcher, TupleEntry output )
200    {
201    int count = matcher.groupCount();
202
203    // if UNKNOWN then the returned number fields will be of variable size
204    // subsequently we must clear the tuple, and add the found values
205    if( functionCall.getDeclaredFields().isUnknown() )
206      addGroupsToTuple( matcher, output, count );
207    else
208      setGroupsOnTuple( matcher, output, count );
209
210    // this overcomes an issue in the planner resolver where if REPLACE is declared, the declared
211    // fields for the current operation are expected to match the argument fields
212    functionCall.getOutputCollector().add( output.getTuple() );
213    }
214
215  private void setGroupsOnTuple( Matcher matcher, TupleEntry output, int count )
216    {
217    if( count == 0 )
218      {
219      try
220        {
221        output.setString( 0, matcher.group( 0 ) );
222        }
223      catch( Exception exception )
224        {
225        throw new CascadingException( "unable to set tuple value at field: " + output.getFields().get( 0 ) + ", from regex group: 0", exception );
226        }
227      }
228    else
229      {
230      for( int i = 0; i < count; i++ )
231        {
232        try
233          {
234          output.setString( i, matcher.group( i + 1 ) ); // skip group 0
235          }
236        catch( Exception exception )
237          {
238          throw new CascadingException( "unable to set tuple value at field: " + output.getFields().get( i ) + ", from regex group: " + ( i + 1 ), exception );
239          }
240        }
241      }
242    }
243
244  private void addGroupsToTuple( Matcher matcher, TupleEntry output, int count )
245    {
246    Tuple tuple = output.getTuple();
247
248    tuple.clear();
249
250    if( count == 0 )
251      {
252      tuple.add( matcher.group( 0 ) );
253      }
254    else
255      {
256      for( int i = 0; i < count; i++ )
257        tuple.add( matcher.group( i + 1 ) ); // skip group 0
258      }
259    }
260
261  private void onGivenGroups( FunctionCall<Pair<Matcher, TupleEntry>> functionCall, Matcher matcher, TupleEntry output )
262    {
263    for( int i = 0; i < groups.length; i++ )
264      output.setString( i, matcher.group( groups[ i ] ) );
265
266    functionCall.getOutputCollector().add( output );
267    }
268
269  @Override
270  public boolean equals( Object object )
271    {
272    if( this == object )
273      return true;
274    if( !( object instanceof RegexParser ) )
275      return false;
276    if( !super.equals( object ) )
277      return false;
278
279    RegexParser that = (RegexParser) object;
280
281    if( !Arrays.equals( groups, that.groups ) )
282      return false;
283
284    return true;
285    }
286
287  @Override
288  public int hashCode()
289    {
290    int result = super.hashCode();
291    result = 31 * result + ( groups != null ? Arrays.hashCode( groups ) : 0 );
292    return result;
293    }
294  }