3131
3232import com .google .common .collect .ImmutableList ;
3333
34+ import org .apache .drill .exec .vector .ValueVector ;
3435import org .apache .drill .exec .vector .complex .RepeatedValueVector ;
3536import org .slf4j .Logger ;
3637import org .slf4j .LoggerFactory ;
3738
3839public abstract class FlattenTemplate implements Flattener {
3940 private static final Logger logger = LoggerFactory .getLogger (FlattenTemplate .class );
4041
41- private static final int OUTPUT_BATCH_SIZE = 4 *1024 ;
42- private static final int OUTPUT_MEMORY_LIMIT = 512 * 1024 * 1024 ;
42+ private static final int OUTPUT_ROW_COUNT = ValueVector .MAX_ROW_COUNT ;
4343
4444 private ImmutableList <TransferPair > transfers ;
4545 private BufferAllocator outputAllocator ;
4646 private SelectionVectorMode svMode ;
4747 private RepeatedValueVector fieldToFlatten ;
4848 private RepeatedValueVector .RepeatedAccessor accessor ;
4949 private int valueIndex ;
50- private boolean bigRecords = false ;
51- private int bigRecordsBufferSize ;
5250
5351 /**
54- * The output batch limit starts at OUTPUT_BATCH_SIZE , but may be decreased
52+ * The output batch limit starts at OUTPUT_ROW_COUNT , but may be decreased
5553 * if records are found to be large.
5654 */
57- private int outputLimit = OUTPUT_BATCH_SIZE ;
55+ private int outputLimit = OUTPUT_ROW_COUNT ;
5856
5957 // This allows groups to be written across batches if we run out of space; in cases where we have finished
6058 // a batch on the boundary, it will be set to 0.
@@ -72,6 +70,11 @@ public RepeatedValueVector getFlattenField() {
7270 return fieldToFlatten ;
7371 }
7472
73+ @ Override
74+ public void setOutputCount (int outputCount ) {
75+ outputLimit = outputCount ;
76+ }
77+
7578 @ Override
7679 public final int flattenRecords (final int recordCount , final int firstOutputIndex ,
7780 final Flattener .Monitor monitor ) {
@@ -101,75 +104,10 @@ public final int flattenRecords(final int recordCount, final int firstOutputInde
101104 for ( ; innerValueIndexLocal < innerValueCount ; innerValueIndexLocal ++) {
102105 // If we've hit the batch size limit, stop and flush what we've got so far.
103106 if (recordsThisCall == outputLimit ) {
104- if (bigRecords ) {
105- /*
106- * We got to the limit we used before, but did we go over
107- * the bigRecordsBufferSize in the second half of the batch? If
108- * so, we'll need to adjust the batch limits.
109- */
110- adjustBatchLimits (1 , monitor , recordsThisCall );
111- }
112-
113107 // Flush this batch.
114108 break outer ;
115109 }
116110
117- /*
118- * At the moment, the output record includes the input record, so for very
119- * large records that we're flattening, we're carrying forward the original
120- * record as well as the flattened element. We've seen a case where flattening a 4MB
121- * record with a 20,000 element array causing memory usage to explode. To avoid
122- * that until we can push down the selected fields to operators like this, we
123- * also limit the amount of memory in use at one time.
124- *
125- * We have to have written at least one record to be able to get a buffer that will
126- * have a real allocator, so we have to do this lazily. We won't check the limit
127- * for the first two records, but that keeps this simple.
128- */
129- if (bigRecords ) {
130- /*
131- * If we're halfway through the outputLimit, check on our memory
132- * usage so far.
133- */
134- if (recordsThisCall == outputLimit / 2 ) {
135- /*
136- * If we've used more than half the space we've used for big records
137- * in the past, we've seen even bigger records than before, so stop and
138- * see if we need to flush here before we go over bigRecordsBufferSize
139- * memory usage, and reduce the outputLimit further before we continue
140- * with the next batch.
141- */
142- if (adjustBatchLimits (2 , monitor , recordsThisCall )) {
143- break outer ;
144- }
145- }
146- } else {
147- if (outputAllocator .getAllocatedMemory () > OUTPUT_MEMORY_LIMIT ) {
148- /*
149- * We're dealing with big records. Reduce the outputLimit to
150- * the current record count, and take note of how much space the
151- * vectors report using for that. We'll use those numbers as limits
152- * going forward in order to avoid allocating more memory.
153- */
154- bigRecords = true ;
155- outputLimit = Math .min (recordsThisCall , outputLimit );
156- if (outputLimit < 1 ) {
157- throw new IllegalStateException ("flatten outputLimit (" + outputLimit
158- + ") won't make progress" );
159- }
160-
161- /*
162- * This will differ from what the allocator reports because of
163- * overhead. But the allocator check is much cheaper to do, so we
164- * only compute this at selected times.
165- */
166- bigRecordsBufferSize = monitor .getBufferSizeFor (recordsThisCall );
167-
168- // Stop and flush.
169- break outer ;
170- }
171- }
172-
173111 try {
174112 doEval (valueIndexLocal , outputIndex );
175113 } catch (OversizedAllocationException ex ) {
@@ -211,68 +149,6 @@ public final int flattenRecords(final int recordCount, final int firstOutputInde
211149 }
212150 }
213151
214- /**
215- * Determine if the current batch record limit needs to be adjusted (when handling
216- * bigRecord mode). If so, adjust the limit, and return true, otherwise return false.
217- *
218- * <p>If the limit is adjusted, it will always be adjusted down, because we need to operate
219- * based on the largest sized record we've ever seen.</p>
220- *
221- * <p>If the limit is adjusted, then the current batch should be flushed, because
222- * continuing would lead to going over the large memory limit that has already been
223- * established.</p>
224- *
225- * @param multiplier Multiply currently used memory (according to the monitor) before
226- * checking against past memory limits. This allows for checking the currently used
227- * memory after processing a fraction of the expected batch limit, but using that as
228- * a predictor of the full batch's size. For example, if this is checked after half
229- * the batch size limit's records are processed, then using a multiplier of two will
230- * do the check under the assumption that processing the full batch limit will use
231- * twice as much memory.
232- * @param monitor the Flattener.Monitor instance to use for the current memory usage check
233- * @param recordsThisCall the number of records processed so far during this call to
234- * flattenRecords().
235- * @return true if the batch size limit was adjusted, false otherwise
236- */
237- private boolean adjustBatchLimits (final int multiplier , final Flattener .Monitor monitor ,
238- final int recordsThisCall ) {
239- assert bigRecords : "adjusting batch limits when no big records" ;
240- final int bufferSize = multiplier * monitor .getBufferSizeFor (recordsThisCall );
241-
242- /*
243- * If the amount of space we've used so far is below the amount that triggered
244- * the bigRecords mode, then no adjustment is needed.
245- */
246- if (bufferSize <= bigRecordsBufferSize ) {
247- return false ;
248- }
249-
250- /*
251- * We've used more space than we've used for big records in the past, we've seen
252- * even bigger records, so we need to adjust our limits, and flush what we've got so far.
253- *
254- * We should reduce the outputLimit proportionately to get the predicted
255- * amount of memory used back down to bigRecordsBufferSize.
256- *
257- * The number of records to limit is therefore
258- * outputLimit *
259- * (1 - (bufferSize - bigRecordsBufferSize) / bigRecordsBufferSize)
260- *
261- * Doing some algebra on the multiplier:
262- * (bigRecordsBufferSize - (bufferSize - bigRecordsBufferSize)) / bigRecordsBufferSize
263- * (bigRecordsBufferSize - bufferSize + bigRecordsBufferSize) / bigRecordsBufferSize
264- * (2 * bigRecordsBufferSize - bufferSize) / bigRecordsBufferSize
265- *
266- * If bufferSize has gotten so big that this would be negative, we'll
267- * just go down to one record per batch. We need to check for that on
268- * outputLimit anyway, in order to make sure that we make progress.
269- */
270- final int newLimit = (int )
271- (outputLimit * (2.0 * ((double ) bigRecordsBufferSize ) - bufferSize ) / bigRecordsBufferSize );
272- outputLimit = Math .max (1 , newLimit );
273- return true ;
274- }
275-
276152 @ Override
277153 public final void setup (FragmentContext context , RecordBatch incoming , RecordBatch outgoing , List <TransferPair > transfers ) throws SchemaChangeException {
278154
0 commit comments