001    /*
002     * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
003     *
004     * Project and contact information: http://www.cascading.org/
005     *
006     * This file is part of the Cascading project.
007     *
008     * Licensed under the Apache License, Version 2.0 (the "License");
009     * you may not use this file except in compliance with the License.
010     * You may obtain a copy of the License at
011     *
012     *     http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing, software
015     * distributed under the License is distributed on an "AS IS" BASIS,
016     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017     * See the License for the specific language governing permissions and
018     * limitations under the License.
019     */
020    
021    package cascading.tap.hadoop;
022    
023    import java.beans.ConstructorProperties;
024    import java.io.IOException;
025    
026    import cascading.flow.FlowProcess;
027    import cascading.tap.BaseTemplateTap;
028    import cascading.tap.SinkMode;
029    import cascading.tap.Tap;
030    import cascading.tap.hadoop.io.TapOutputCollector;
031    import cascading.tuple.Fields;
032    import cascading.tuple.Tuple;
033    import cascading.tuple.TupleEntrySchemeCollector;
034    import org.apache.hadoop.mapred.JobConf;
035    import org.apache.hadoop.mapred.OutputCollector;
036    
037    /**
038     * Class TemplateTap can be used to write tuple streams out to sub-directories based on the values in the {@link Tuple}
039     * instance.
040     * <p/>
041     * The constructor takes a {@link Hfs} {@link cascading.tap.Tap} and a {@link java.util.Formatter} format syntax String. This allows
042     * Tuple values at given positions to be used as directory names. Note that Hadoop can only sink to directories, and
043     * all files in those directories are "part-xxxxx" files.
044     * <p/>
045     * {@code openTapsThreshold} limits the number of open files to be output to. This value defaults to 300 files.
046     * Each time the threshold is exceeded, 10% of the least recently used open files will be closed.
047     * <p/>
048     * TemplateTap will populate a given {@code pathTemplate} without regard to case of the values being used. Thus
049     * the resulting paths {@code 2012/June/} and {@code 2012/june/} will likely result in two open files into the same
050     * location. Forcing the case to be consistent with an upstream {@link cascading.operation.Function} is recommended, see
051     * {@link cascading.operation.expression.ExpressionFunction}.
052     * <p/>
053     * Though Hadoop has no mechanism to prevent simultaneous writes to a directory from multiple jobs, it doesn't mean
054     * its safe to do so. Same is true with the TemplateTap. Interleaving writes to a common parent (root) directory
055     * across multiple flows will very likely lead to data loss.
056     */
057    @Deprecated
058    public class TemplateTap extends BaseTemplateTap<JobConf, OutputCollector>
059      {
060      /**
061       * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
062       * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
063       *
064       * @param parent       of type Tap
065       * @param pathTemplate of type String
066       */
067      @ConstructorProperties({"parent", "pathTemplate"})
068      public TemplateTap( Hfs parent, String pathTemplate )
069        {
070        this( parent, pathTemplate, OPEN_TAPS_THRESHOLD_DEFAULT );
071        }
072    
073      /**
074       * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
075       * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
076       * <p/>
077       * {@code openTapsThreshold} limits the number of open files to be output to.
078       *
079       * @param parent            of type Hfs
080       * @param pathTemplate      of type String
081       * @param openTapsThreshold of type int
082       */
083      @ConstructorProperties({"parent", "pathTemplate", "openTapsThreshold"})
084      public TemplateTap( Hfs parent, String pathTemplate, int openTapsThreshold )
085        {
086        super( parent, pathTemplate, openTapsThreshold );
087        }
088    
089      /**
090       * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
091       * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
092       *
093       * @param parent       of type Tap
094       * @param pathTemplate of type String
095       * @param sinkMode     of type SinkMode
096       */
097      @ConstructorProperties({"parent", "pathTemplate", "sinkMode"})
098      public TemplateTap( Hfs parent, String pathTemplate, SinkMode sinkMode )
099        {
100        super( parent, pathTemplate, sinkMode );
101        }
102    
103      /**
104       * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
105       * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
106       * <p/>
107       * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when {@link #deleteResource(Object)}
108       * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
109       *
110       * @param parent             of type Tap
111       * @param pathTemplate       of type String
112       * @param sinkMode           of type SinkMode
113       * @param keepParentOnDelete of type boolean
114       */
115      @ConstructorProperties({"parent", "pathTemplate", "sinkMode", "keepParentOnDelete"})
116      public TemplateTap( Hfs parent, String pathTemplate, SinkMode sinkMode, boolean keepParentOnDelete )
117        {
118        this( parent, pathTemplate, sinkMode, keepParentOnDelete, OPEN_TAPS_THRESHOLD_DEFAULT );
119        }
120    
121      /**
122       * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
123       * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
124       * <p/>
125       * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when {@link #deleteResource(Object)}
126       * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
127       * <p/>
128       * {@code openTapsThreshold} limits the number of open files to be output to.
129       *
130       * @param parent             of type Tap
131       * @param pathTemplate       of type String
132       * @param sinkMode           of type SinkMode
133       * @param keepParentOnDelete of type boolean
134       * @param openTapsThreshold  of type int
135       */
136      @ConstructorProperties({"parent", "pathTemplate", "sinkMode", "keepParentOnDelete", "openTapsThreshold"})
137      public TemplateTap( Hfs parent, String pathTemplate, SinkMode sinkMode, boolean keepParentOnDelete, int openTapsThreshold )
138        {
139        super( parent, pathTemplate, sinkMode, keepParentOnDelete, openTapsThreshold );
140        }
141    
142      /**
143       * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
144       * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
145       * The pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
146       * <p/>
147       * This constructor also allows the sinkFields of the parent Tap to be independent of the pathFields. Thus allowing
148       * data not in the result file to be used in the template path name.
149       *
150       * @param parent       of type Tap
151       * @param pathTemplate of type String
152       * @param pathFields   of type Fields
153       */
154      @ConstructorProperties({"parent", "pathTemplate", "pathFields"})
155      public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields )
156        {
157        this( parent, pathTemplate, pathFields, OPEN_TAPS_THRESHOLD_DEFAULT );
158        }
159    
160      /**
161       * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
162       * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
163       * The pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
164       * <p/>
165       * This constructor also allows the sinkFields of the parent Tap to be independent of the pathFields. Thus allowing
166       * data not in the result file to be used in the template path name.
167       * <p/>
168       * {@code openTapsThreshold} limits the number of open files to be output to.
169       *
170       * @param parent            of type Hfs
171       * @param pathTemplate      of type String
172       * @param pathFields        of type Fields
173       * @param openTapsThreshold of type int
174       */
175      @ConstructorProperties({"parent", "pathTemplate", "pathFields", "openTapsThreshold"})
176      public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, int openTapsThreshold )
177        {
178        super( parent, pathTemplate, pathFields, openTapsThreshold );
179        }
180    
181      /**
182       * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
183       * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
184       * The pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
185       * <p/>
186       * This constructor also allows the sinkFields of the parent Tap to be independent of the pathFields. Thus allowing
187       * data not in the result file to be used in the template path name.
188       *
189       * @param parent       of type Tap
190       * @param pathTemplate of type String
191       * @param pathFields   of type Fields
192       * @param sinkMode     of type SinkMode
193       */
194      @ConstructorProperties({"parent", "pathTemplate", "pathFields", "sinkMode"})
195      public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, SinkMode sinkMode )
196        {
197        super( parent, pathTemplate, pathFields, sinkMode );
198        }
199    
200      /**
201       * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
202       * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
203       * The pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
204       * <p/>
205       * This constructor also allows the sinkFields of the parent Tap to be independent of the pathFields. Thus allowing
206       * data not in the result file to be used in the template path name.
207       * <p/>
208       * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when {@link #deleteResource(Object)}
209       * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
210       *
211       * @param parent             of type Tap
212       * @param pathTemplate       of type String
213       * @param pathFields         of type Fields
214       * @param sinkMode           of type SinkMode
215       * @param keepParentOnDelete of type boolean
216       */
217      @ConstructorProperties({"parent", "pathTemplate", "pathFields", "sinkMode", "keepParentOnDelete"})
218      public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, SinkMode sinkMode, boolean keepParentOnDelete )
219        {
220        this( parent, pathTemplate, pathFields, sinkMode, keepParentOnDelete, OPEN_TAPS_THRESHOLD_DEFAULT );
221        }
222    
223      /**
224       * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
225       * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
226       * The pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
227       * <p/>
228       * This constructor also allows the sinkFields of the parent Tap to be independent of the pathFields. Thus allowing
229       * data not in the result file to be used in the template path name.
230       * <p/>
231       * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when {@link #deleteResource(Object)}
232       * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
233       * <p/>
234       * {@code openTapsThreshold} limits the number of open files to be output to.
235       *
236       * @param parent             of type Hfs
237       * @param pathTemplate       of type String
238       * @param pathFields         of type Fields
239       * @param sinkMode           of type SinkMode
240       * @param keepParentOnDelete of type boolean
241       * @param openTapsThreshold  of type int
242       */
243      @ConstructorProperties({"parent", "pathTemplate", "pathFields", "sinkMode", "keepParentOnDelete",
244                              "openTapsThreshold"})
245      public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, SinkMode sinkMode, boolean keepParentOnDelete, int openTapsThreshold )
246        {
247        super( parent, pathTemplate, pathFields, sinkMode, keepParentOnDelete, openTapsThreshold );
248        }
249    
250      @Override
251      protected TupleEntrySchemeCollector createTupleEntrySchemeCollector( FlowProcess<JobConf> flowProcess, Tap parent, String path ) throws IOException
252        {
253        TapOutputCollector outputCollector = new TapOutputCollector( flowProcess, parent, path );
254    
255        return new TupleEntrySchemeCollector<JobConf, OutputCollector>( flowProcess, parent, outputCollector );
256        }
257      }