001/*
002 * Copyright (c) 2007-2016 Concurrent, Inc. All Rights Reserved.
003 *
004 * Project and contact information: http://www.cascading.org/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading.operation.regex;
022
023import java.beans.ConstructorProperties;
024import java.util.Arrays;
025import java.util.regex.Matcher;
026
027import cascading.flow.FlowProcess;
028import cascading.operation.Function;
029import cascading.operation.FunctionCall;
030import cascading.operation.OperationCall;
031import cascading.operation.OperationException;
032import cascading.tuple.Fields;
033import cascading.tuple.Tuple;
034import cascading.util.Pair;
035
036/**
037 * Class RegexParser is used to extract a matched regex from an incoming argument value.
038 * <p/>
039 * RegexParser only expects one field value. If more than one argument value is passed, only the
040 * first is handled, the remainder are ignored.
041 * <p/>
042 * Sometimes its useful to parse out a value from a key/value pair in a string, if the key exists. If the key does
043 * not exist, returning an empty string instead of failing is typically expected.
044 * <p/>
045 * The following regex can extract a value from {@code key1=value1&key2=value2} if key1 exists, otherwise an
046 * empty string is returned:<br/>
047 * <pre>(?<=key1=)[^&]*|$</pre>
048 * <p/>
049 * Note a {@code null} valued argument passed to the parser will be converted to an empty string ({@code ""}) before
050 * the regex is applied.
051 * <p/>
052 * Any Object value will be coerced to a String type if type information is provided. See the
053 * {@link cascading.tuple.type.CoercibleType} interface to control how custom Object types are converted to String
054 * values.
055 */
056public class RegexParser extends RegexOperation<Pair<Matcher, Tuple>> implements Function<Pair<Matcher, Tuple>>
057  {
058  /** Field groups */
059  private int[] groups = null;
060
061  /**
062   * Constructor RegexParser creates a new RegexParser instance, where the argument Tuple value is matched and returned
063   * in a new Tuple.
064   * <p/>
065   * If the given patternString declares regular expression groups, each group will be returned as a value in the
066   * resulting Tuple. If no groups are declared, the match will be returned as the only value in the resulting Tuple.
067   * <p/>
068   * The fields returned will be {@link Fields#UNKNOWN}, so a variable number of values may be emitted based on the
069   * regular expression given.
070   *
071   * @param patternString of type String
072   */
073  @ConstructorProperties({"patternString"})
074  public RegexParser( String patternString )
075    {
076    super( 1, patternString );
077    }
078
079  /**
080   * Constructor RegexParser creates a new RegexParser instance, where the argument Tuple value is matched and returned
081   * as the given Field.
082   * <p/>
083   * If the given patternString declares regular expression groups, each group will be returned as a value in the
084   * resulting Tuple. If no groups are declared, the match will be returned as the only value in the resulting Tuple.
085   * <p/>
086   * If the number of fields in the fieldDeclaration does not match the number of groups matched, an {@link OperationException}
087   * will be thrown during runtime.
088   * <p/>
089   * To overcome this, either use the constructors that take an array of groups, or use the {@code (?: ...)} sequence
090   * to tell the regular expression matcher to not capture the group.
091   *
092   * @param fieldDeclaration of type Fields
093   * @param patternString    of type String
094   */
095  @ConstructorProperties({"fieldDeclaration", "patternString"})
096  public RegexParser( Fields fieldDeclaration, String patternString )
097    {
098    super( 1, fieldDeclaration, patternString );
099    }
100
101  /**
102   * Constructor RegexParser creates a new RegexParser instance, where the patternString is a regular expression
103   * with match groups and whose groups designated by {@code groups} are stored in the appropriate number of new fields.
104   * <p/>
105   * The number of resulting fields will match the number of groups given ({@code groups.length}).
106   *
107   * @param patternString of type String
108   * @param groups        of type int[]
109   */
110  @ConstructorProperties({"patternString", "groups"})
111  public RegexParser( String patternString, int[] groups )
112    {
113    super( 1, Fields.size( verifyReturnLength( groups ) ), patternString );
114
115    this.groups = Arrays.copyOf( groups, groups.length );
116    }
117
118  private static int verifyReturnLength( int[] groups )
119    {
120    if( groups == null || groups.length == 0 )
121      throw new IllegalArgumentException( "groups may not be null or 0 length" );
122
123    return groups.length;
124    }
125
126  /**
127   * Constructor RegexParser creates a new RegexParser instance, where the patternString is a regular expression
128   * with match groups and whose groups designated by {@code groups} are stored in the named fieldDeclarations.
129   *
130   * @param fieldDeclaration of type Fields
131   * @param patternString    of type String
132   * @param groups           of type int[]
133   */
134  @ConstructorProperties({"fieldDeclaration", "patternString", "groups"})
135  public RegexParser( Fields fieldDeclaration, String patternString, int[] groups )
136    {
137    super( 1, fieldDeclaration, patternString );
138
139    verifyReturnLength( groups );
140
141    this.groups = Arrays.copyOf( groups, groups.length );
142
143    if( !fieldDeclaration.isUnknown() && fieldDeclaration.size() != groups.length )
144      throw new IllegalArgumentException( "fieldDeclaration must equal number of groups to be captured, fields: " + fieldDeclaration.print() );
145    }
146
147  public int[] getGroups()
148    {
149    if( groups == null )
150      return null;
151
152    return Arrays.copyOf( groups, groups.length );
153    }
154
155  @Override
156  public void prepare( FlowProcess flowProcess, OperationCall<Pair<Matcher, Tuple>> operationCall )
157    {
158    operationCall.setContext( new Pair<Matcher, Tuple>( getPattern().matcher( "" ), new Tuple() ) );
159    }
160
161  @Override
162  public void operate( FlowProcess flowProcess, FunctionCall<Pair<Matcher, Tuple>> functionCall )
163    {
164    String value = functionCall.getArguments().getString( 0 );
165
166    if( value == null )
167      value = "";
168
169    Matcher matcher = functionCall.getContext().getLhs().reset( value );
170
171    if( !matcher.find() )
172      throw new OperationException( "could not match pattern: [" + getPatternString() + "] with value: [" + value + "]" );
173
174    Tuple output = functionCall.getContext().getRhs();
175
176    output.clear();
177
178    if( groups != null )
179      onGivenGroups( functionCall, matcher, output );
180    else
181      onFoundGroups( functionCall, matcher, output );
182    }
183
184  private final void onFoundGroups( FunctionCall<Pair<Matcher, Tuple>> functionCall, Matcher matcher, Tuple output )
185    {
186    int count = matcher.groupCount();
187
188    if( count == 0 )
189      {
190      output.add( matcher.group( 0 ) );
191      }
192    else
193      {
194      for( int i = 0; i < count; i++ )
195        output.add( matcher.group( i + 1 ) ); // skip group 0
196      }
197
198    functionCall.getOutputCollector().add( output );
199    }
200
201  private final void onGivenGroups( FunctionCall<Pair<Matcher, Tuple>> functionCall, Matcher matcher, Tuple output )
202    {
203    for( int pos : groups )
204      output.add( matcher.group( pos ) );
205
206    functionCall.getOutputCollector().add( output );
207    }
208
209  @Override
210  public boolean equals( Object object )
211    {
212    if( this == object )
213      return true;
214    if( !( object instanceof RegexParser ) )
215      return false;
216    if( !super.equals( object ) )
217      return false;
218
219    RegexParser that = (RegexParser) object;
220
221    if( !Arrays.equals( groups, that.groups ) )
222      return false;
223
224    return true;
225    }
226
227  @Override
228  public int hashCode()
229    {
230    int result = super.hashCode();
231    result = 31 * result + ( groups != null ? Arrays.hashCode( groups ) : 0 );
232    return result;
233    }
234  }