001    /*
002     * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
003     *
004     * Project and contact information: http://www.cascading.org/
005     *
006     * This file is part of the Cascading project.
007     *
008     * Licensed under the Apache License, Version 2.0 (the "License");
009     * you may not use this file except in compliance with the License.
010     * You may obtain a copy of the License at
011     *
012     *     http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing, software
015     * distributed under the License is distributed on an "AS IS" BASIS,
016     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017     * See the License for the specific language governing permissions and
018     * limitations under the License.
019     */
020    
021    package cascading.operation.regex;
022    
023    import java.beans.ConstructorProperties;
024    import java.util.Arrays;
025    import java.util.regex.Matcher;
026    
027    import cascading.flow.FlowProcess;
028    import cascading.operation.Function;
029    import cascading.operation.FunctionCall;
030    import cascading.operation.OperationCall;
031    import cascading.operation.OperationException;
032    import cascading.tuple.Fields;
033    import cascading.tuple.Tuple;
034    import cascading.util.Pair;
035    
036    /**
037     * Class RegexParser is used to extract a matched regex from an incoming argument value.
038     * <p/>
039     * RegexParser only expects one field value. If more than one argument value is passed, only the
040     * first is handled, the remainder are ignored.
041     * <p/>
042     * Sometimes its useful to parse out a value from a key/value pair in a string, if the key exists. If the key does
043     * not exist, returning an empty string instead of failing is typically expected.
044     * <p/>
045     * The following regex can extract a value from {@code key1=value1&key2=value2} if key1 exists, otherwise an
046     * empty string is returned:<br/>
047     * <pre>(?<=key1=)[^&]*|$</pre>
048     * <p/>
049     * Note a {@code null} valued argument passed to the parser will be converted to an empty string ({@code ""}) before
050     * the regex is applied.
051     * <p/>
052     * Any Object value will be coerced to a String type if type information is provided. See the
053     * {@link cascading.tuple.type.CoercibleType} interface to control how custom Object types are converted to String
054     * values.
055     */
056    public class RegexParser extends RegexOperation<Pair<Matcher, Tuple>> implements Function<Pair<Matcher, Tuple>>
057      {
058      /** Field groups */
059      private int[] groups = null;
060    
061      /**
062       * Constructor RegexParser creates a new RegexParser instance, where the argument Tuple value is matched and returned
063       * in a new Tuple.
064       * <p/>
065       * If the given patternString declares regular expression groups, each group will be returned as a value in the
066       * resulting Tuple. If no groups are declared, the match will be returned as the only value in the resulting Tuple.
067       * <p/>
068       * The fields returned will be {@link Fields#UNKNOWN}, so a variable number of values may be emitted based on the
069       * regular expression given.
070       *
071       * @param patternString of type String
072       */
073      @ConstructorProperties({"patternString"})
074      public RegexParser( String patternString )
075        {
076        super( 1, patternString );
077        }
078    
079      /**
080       * Constructor RegexParser creates a new RegexParser instance, where the argument Tuple value is matched and returned
081       * as the given Field.
082       * <p/>
083       * If the given patternString declares regular expression groups, each group will be returned as a value in the
084       * resulting Tuple. If no groups are declared, the match will be returned as the only value in the resulting Tuple.
085       * <p/>
086       * If the number of fields in the fieldDeclaration does not match the number of groups matched, an {@link OperationException}
087       * will be thrown during runtime.
088       * <p/>
089       * To overcome this, either use the constructors that take an array of groups, or use the {@code (?: ...)} sequence
090       * to tell the regular expression matcher to not capture the group.
091       *
092       * @param fieldDeclaration of type Fields
093       * @param patternString    of type String
094       */
095      @ConstructorProperties({"fieldDeclaration", "patternString"})
096      public RegexParser( Fields fieldDeclaration, String patternString )
097        {
098        super( 1, fieldDeclaration, patternString );
099        }
100    
101      /**
102       * Constructor RegexParser creates a new RegexParser instance, where the patternString is a regular expression
103       * with match groups and whose groups designated by {@code groups} are stored in the appropriate number of new fields.
104       * <p/>
105       * The number of resulting fields will match the number of groups given ({@code groups.length}).
106       *
107       * @param patternString of type String
108       * @param groups        of type int[]
109       */
110      @ConstructorProperties({"patternString", "groups"})
111      public RegexParser( String patternString, int[] groups )
112        {
113        super( 1, Fields.size( verifyReturnLength( groups ) ), patternString );
114    
115        this.groups = Arrays.copyOf( groups, groups.length );
116        }
117    
118      private static int verifyReturnLength( int[] groups )
119        {
120        if( groups == null || groups.length == 0 )
121          throw new IllegalArgumentException( "groups may not be null or 0 length" );
122    
123        return groups.length;
124        }
125    
126      /**
127       * Constructor RegexParser creates a new RegexParser instance, where the patternString is a regular expression
128       * with match groups and whose groups designated by {@code groups} are stored in the named fieldDeclarations.
129       *
130       * @param fieldDeclaration of type Fields
131       * @param patternString    of type String
132       * @param groups           of type int[]
133       */
134      @ConstructorProperties({"fieldDeclaration", "patternString", "groups"})
135      public RegexParser( Fields fieldDeclaration, String patternString, int[] groups )
136        {
137        super( 1, fieldDeclaration, patternString );
138    
139        verifyReturnLength( groups );
140    
141        this.groups = Arrays.copyOf( groups, groups.length );
142    
143        if( !fieldDeclaration.isUnknown() && fieldDeclaration.size() != groups.length )
144          throw new IllegalArgumentException( "fieldDeclaration must equal number of groups to be captured, fields: " + fieldDeclaration.print() );
145        }
146    
147      public int[] getGroups()
148        {
149        if( groups == null )
150          return null;
151    
152        return Arrays.copyOf( groups, groups.length );
153        }
154    
155      @Override
156      public void prepare( FlowProcess flowProcess, OperationCall<Pair<Matcher, Tuple>> operationCall )
157        {
158        operationCall.setContext( new Pair<Matcher, Tuple>( getPattern().matcher( "" ), new Tuple() ) );
159        }
160    
161      @Override
162      public void operate( FlowProcess flowProcess, FunctionCall<Pair<Matcher, Tuple>> functionCall )
163        {
164        String value = functionCall.getArguments().getString( 0 );
165    
166        if( value == null )
167          value = "";
168    
169        Matcher matcher = functionCall.getContext().getLhs().reset( value );
170    
171        if( !matcher.find() )
172          throw new OperationException( "could not match pattern: [" + getPatternString() + "] with value: [" + value + "]" );
173    
174        Tuple output = functionCall.getContext().getRhs();
175    
176        output.clear();
177    
178        if( groups != null )
179          onGivenGroups( functionCall, matcher, output );
180        else
181          onFoundGroups( functionCall, matcher, output );
182        }
183    
184      private final void onFoundGroups( FunctionCall<Pair<Matcher, Tuple>> functionCall, Matcher matcher, Tuple output )
185        {
186        int count = matcher.groupCount();
187    
188        if( count == 0 )
189          {
190          output.add( matcher.group( 0 ) );
191          }
192        else
193          {
194          for( int i = 0; i < count; i++ )
195            output.add( matcher.group( i + 1 ) ); // skip group 0
196          }
197    
198        functionCall.getOutputCollector().add( output );
199        }
200    
201      private final void onGivenGroups( FunctionCall<Pair<Matcher, Tuple>> functionCall, Matcher matcher, Tuple output )
202        {
203        for( int pos : groups )
204          output.add( matcher.group( pos ) );
205    
206        functionCall.getOutputCollector().add( output );
207        }
208    
209      @Override
210      public boolean equals( Object object )
211        {
212        if( this == object )
213          return true;
214        if( !( object instanceof RegexParser ) )
215          return false;
216        if( !super.equals( object ) )
217          return false;
218    
219        RegexParser that = (RegexParser) object;
220    
221        if( !Arrays.equals( groups, that.groups ) )
222          return false;
223    
224        return true;
225        }
226    
227      @Override
228      public int hashCode()
229        {
230        int result = super.hashCode();
231        result = 31 * result + ( groups != null ? Arrays.hashCode( groups ) : 0 );
232        return result;
233        }
234      }