001/*
002 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
004 *
005 * Project and contact information: http://www.cascading.org/
006 *
007 * This file is part of the Cascading project.
008 *
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *     http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 */
021
022package cascading.operation;
023
024import cascading.flow.FlowProcess;
025import cascading.tuple.TupleEntry;
026
027/**
028 * An Aggregator takes the set of all values associated with a unique grouping and returns
029 * zero or more values. {@link cascading.operation.aggregator.MaxValue}, {@link cascading.operation.aggregator.MinValue},
030 * {@link cascading.operation.aggregator.Count}, and {@link cascading.operation.aggregator.Average} are good examples.
031 * <p>
032 * Aggregator implementations should be reentrant. There is no guarantee an Aggregator instance will be executed in a
033 * unique VM, or by a single thread. The {@link #start(cascading.flow.FlowProcess, AggregatorCall)}
034 * method provides a mechanism for maintaining a 'context' object to hold intermediate values.
035 * <p>
036 * Note that {@link TupleEntry} instances are reused internally, so they should not be stored. Instead, use the
037 * TupleEntry or Tuple copy constructors to make safe copies.
038 * <p>
039 * Since Aggregators can be chained, and Cascading pipelines all operation results, any Aggregators
040 * coming ahead of the current Aggregator must return a value before the {@link #complete(cascading.flow.FlowProcess, AggregatorCall)}
041 * method on this Aggregator is called. Consequently, if any previous Aggregators return more than one Tuple result,
042 * the {@code complete()} method on this Aggregator will be called for each Tuple emitted.
043 * Thus it is a best practice to implement a {@link Buffer} when emitting zero, or more than one, Tuple results.
044 *
045 * @param <Context> the type of the user defined context object read from and set on the {@link AggregatorCall}
046 * @see AggregatorCall
047 * @see OperationCall
048 */
049public interface Aggregator<Context> extends Operation<Context>
050  {
051  /**
052   * Method start initializes the aggregation procedure and is called for every unique grouping.
053   * <p>
054   * The AggregatorCall context should be initialized here if necessary.
055   * <p>
056   * The first time this method is called for a given 'process', the AggregatorCall context will be null. In that case this
057   * method should set a new instance of the user defined context object. When the AggregatorCall context is not null, it
058   * is up to the developer to create a new instance, or 'recycle' the given instance. If recycled, it must be
059   * re-initialized to remove any previous state/values.
060   * <p>
061   * For example, if a Map is used to hold the intermediate data for each subsequent
062   * {@link #aggregate(cascading.flow.FlowProcess, AggregatorCall)} call,
063   * {@code new HashMap()} should be set on the AggregatorCall instance when {@link cascading.operation.AggregatorCall#getContext()} is null.
064   * On the next grouping, {@code start()} will be called again, but this time with the old Map instance. In this case,
065   * {@code map.clear()} should be invoked before returning.
066   *
067   * @param flowProcess    the FlowProcess supplied for this invocation
068   * @param aggregatorCall the AggregatorCall whose context should be created, or reset, for the new grouping
069   */
070  void start( FlowProcess flowProcess, AggregatorCall<Context> aggregatorCall );
071
072  /**
073   * Method aggregate is called for each {@link TupleEntry} value in the current grouping.
074   * <p>
075   * Neither the TupleEntry entry, nor entry.getTuple(), should be stored directly in the context. A copy of the tuple
076   * should be made via the {@code new Tuple( entry.getTuple() )} copy constructor.
077   *
078   * @param flowProcess    the FlowProcess supplied for this invocation
079   * @param aggregatorCall the AggregatorCall holding the context for the current grouping
080   */
081  void aggregate( FlowProcess flowProcess, AggregatorCall<Context> aggregatorCall );
082
083  /**
084   * Method complete will be issued last, after every {@link TupleEntry} has been passed to the
085   * {@link #aggregate(cascading.flow.FlowProcess, AggregatorCall)}
086   * method. Any final calculation should be completed here and passed to the outputCollector.
087   *
088   * @param flowProcess    the FlowProcess supplied for this invocation
089   * @param aggregatorCall the AggregatorCall holding the context for the grouping being completed
090   */
091  void complete( FlowProcess flowProcess, AggregatorCall<Context> aggregatorCall );
092  }