001    /*
002     * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
003     *
004     * Project and contact information: http://www.cascading.org/
005     *
006     * This file is part of the Cascading project.
007     *
008     * Licensed under the Apache License, Version 2.0 (the "License");
009     * you may not use this file except in compliance with the License.
010     * You may obtain a copy of the License at
011     *
012     *     http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing, software
015     * distributed under the License is distributed on an "AS IS" BASIS,
016     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017     * See the License for the specific language governing permissions and
018     * limitations under the License.
019     */
020    
021    package cascading.operation;
022    
023    import cascading.flow.FlowProcess;
024    import cascading.tuple.TupleEntry;
025    
026    /**
027     * An Aggregator takes the set of all values associated with a unique grouping and returns
028     * zero or more values. {@link cascading.operation.aggregator.MaxValue}, {@link cascading.operation.aggregator.MinValue},
029     * {@link cascading.operation.aggregator.Count}, and {@link cascading.operation.aggregator.Average} are good examples.
030     * <p/>
031     * Aggregator implementations should be reentrant. There is no guarantee an Aggregator instance will be executed in a
032     * unique vm, or by a single thread. The {@link #start(cascading.flow.FlowProcess, AggregatorCall)}
033     * method provides a mechanism for maintaining a 'context' object to hold intermediate values.
034     * <p/>
035     * Note {@link TupleEntry} instances are reused internally so should not be stored. Instead use the TupleEntry or Tuple
036     * copy constructors to make safe copies.
037     * <p/>
038     * Since Aggregators can be chained, and Cascading pipelines all operation results, any Aggregators
039     * coming ahead of the current Aggregator must return a value before the {@link #complete(cascading.flow.FlowProcess, AggregatorCall)}
040     * method on this Aggregator is called. Subsequently, if any previous Aggregators return more than one Tuple result,
041     * this complete() method will be called for each Tuple emitted.
042     * <p/>
043     * Thus it is a best practice to implement a {@link Buffer} when emitting more than one, or zero Tuple results.
044     *
045     * @see AggregatorCall
046     * @see OperationCall
047     */
048    public interface Aggregator<Context> extends Operation<Context>
049      {
050      /**
051       * Method start initializes the aggregation procedure and is called for every unique grouping.
052       * <p/>
053       * The AggregatorCall context should be initialized here if necessary.
054       * <p/>
055       * The first time this method is called for a given 'process', the AggregatorCall context will be null. This method should
056       * set a new instance of the user defined context object. When the AggregatorCall context is not null, it is up to
057       * the developer to create a new instance, or 'recycle' the given instance. If recycled, it must be re-initialized to
058       * remove any previous state/values.
059       * <p/>
060       * For example, if a Map is used to hold the intermediate data for each subsequent
061       * {@link #aggregate(cascading.flow.FlowProcess, AggregatorCall)} call,
062       * new HashMap() should be set on the AggregatorCall instance when {@link cascading.operation.AggregatorCall#getContext()} is null.
063       * On the next grouping, start() will be called again, but this time with the old Map instance. In this case,
064       * map.clear() should be invoked before returning.
065       *
066       * @param flowProcess    of type FlowProcess
067       * @param aggregatorCall of type AggregatorCall
068       */
069      void start( FlowProcess flowProcess, AggregatorCall<Context> aggregatorCall );
070    
071      /**
072       * Method aggregate is called for each {@link TupleEntry} value in the current grouping.
073       * <p/>
074       * TupleEntry entry, or entry.getTuple() should not be stored directly in the context. A copy of the tuple
075       * should be made via the {@code new Tuple( entry.getTuple() )} copy constructor.
076       *
077       * @param flowProcess    of type FlowProcess
078       * @param aggregatorCall of type AggregatorCall
079       */
080      void aggregate( FlowProcess flowProcess, AggregatorCall<Context> aggregatorCall );
081    
082      /**
083       * Method complete will be issued last after every {@link TupleEntry} has been passed to the
084       * {@link #aggregate(cascading.flow.FlowProcess, AggregatorCall)}
085       * method.  Any final calculation should be completed here and passed to the outputCollector.
086       *
087       * @param flowProcess    of type FlowProcess
088       * @param aggregatorCall of type AggregatorCall
089       */
090      void complete( FlowProcess flowProcess, AggregatorCall<Context> aggregatorCall );
091      }