/*-------------------------------------------------------------------------
*
* nbtsplitloc.c
* Choose split point code for Postgres btree implementation.
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/access/nbtree/nbtsplitloc.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/nbtree.h"
#include "storage/lmgr.h"
typedef struct
{
/* FindSplitData candidate split */
int leftfree;
int rightfree;
OffsetNumber firstoldonright;
bool newitemonleft;
IndexTuple lastleft_tuple;
IndexTuple firstright_tuple;
} SplitPoint;
typedef struct
{
/* context data for _bt_checksplitloc */
Relation rel;
bool is_leaf; /* T if splitting a leaf page */
OffsetNumber newitemoff; /* where the new item is to be inserted */
int leftspace; /* space available for items on left page */
int rightspace; /* space available for items on right page */
int dataitemstotal; /* space taken by all items, old and new */
int ncandidates; /* current number of split candidates */
SplitPoint *candidates; /* sorted by offset number */
/* context data for _bt_findbestsplitloc */
int optimal_natts;
bool all_duplicates;
double goodenough; /* good enough left/right space delta */
double propfullonleft; /* want propfullonleft * leftfree on left */
bool is_weighted; /* T if propfullonleft used by split */
SplitPoint *best_candidate;
double best_delta;
} FindSplitData;
static void _bt_checksplitloc(FindSplitData *state, int olddataitemstoleft,
OffsetNumber firstoldonright, bool newitemonleft,
IndexTuple lastleftitem, IndexTuple firstrightitem,
Size firstrightitemsz);
static void _bt_findbestsplitloc(FindSplitData *state, int left, int right, int level);
/*
* _bt_findsplitloc() -- find an appropriate place to split a page.
*
* The main goal here is to equalize the free space that will be on each
* split page, *after accounting for the inserted tuple*. (If we fail to
* account for it, we might find ourselves with too little room on the page
* that it needs to go into!)
*
* We are passed the intended insert position of the new tuple, expressed as
* the offsetnumber of the tuple it must go in front of. (This could be
* maxoff+1 if the tuple is to go at the end.)
*
* We return the index of the first existing tuple that should go on the
* righthand page, plus a boolean indicating whether the new tuple goes on
* the left or right page. The bool is necessary to disambiguate the case
* where firstright == newitemoff.
*
* The high key for the left page is formed using the first item on the
* right page, which may seem to be contrary to Lehman & Yao's approach of
* using the left page's last item as its new high key. It isn't, though;
* suffix truncation will leave the left page's high key equal to the last
* item on the left page when two tuples with equal key values enclose the
* split point. It's convenient to always express a split point as a
* firstright offset due to internal page splits, which leave us with a
* right half whose first item becomes a negative infinity item through
* truncation to 0 attributes. In effect, internal page splits store
* firstright's "separator" key at the end of the left page (as left's new
* high key), and store its downlink at the start of the right page. In
* other words, internal page splits conceptually split in the middle of the
* firstright tuple, not on either side of it. Crucially, when splitting
* either a leaf page or an internal page, the new high key will be strictly
* less than the first item on the right page in all cases, despite the fact
* that we start with the assumption that firstright becomes the new high
* key.
*/
OffsetNumber
_bt_findsplitloc(Relation rel,
Page page,
OffsetNumber newitemoff,
Size newitemsz,
IndexTuple newitem,
bool *newitemonleft)
{
BTPageOpaque opaque;
OffsetNumber offnum;
OffsetNumber firstoff;
OffsetNumber maxoff;
ItemId itemid;
FindSplitData state;
int leftspace,
rightspace,
olddataitemstotal,
dataitemstoleft,
leaffillfactor;
SplitPoint *candidates;
int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
OffsetNumber finalfirstright;
IndexTuple previtem;
/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
newitemsz += sizeof(ItemIdData);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
maxoff = PageGetMaxOffsetNumber(page);
/* Total free space available on a btree page, after fixed overhead */
leftspace = rightspace =
PageGetPageSize(page) - SizeOfPageHeaderData -
MAXALIGN(sizeof(BTPageOpaqueData));
/* The right page will have the same high key as the old page */
if (!P_RIGHTMOST(opaque))
{
itemid = PageGetItemId(page, P_HIKEY);
rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
sizeof(ItemIdData));
}
/* Count up total space in data items without actually scanning 'em */
olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
leaffillfactor = RelationGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR);
candidates = (SplitPoint *) palloc((maxoff + 1) * sizeof(SplitPoint));
state.rel = rel;
state.is_leaf = P_ISLEAF(opaque);
state.leftspace = leftspace;
state.rightspace = rightspace;
state.dataitemstotal = olddataitemstotal + newitemsz;
state.candidates = candidates;
state.ncandidates = 0;
/*
* Scan through the data items and calculate space usage for a split at
* each possible position. All the possible splits are recorded in
* 'state.candidates' array.
*/
firstoff = P_FIRSTDATAKEY(opaque);
if (newitemoff == firstoff)
{
previtem = newitem;
dataitemstoleft = newitemsz;
}
else
{
itemid = PageGetItemId(page, firstoff);
previtem = (IndexTuple) PageGetItem(page, itemid);
dataitemstoleft = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
}
for (offnum = firstoff + 1;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
IndexTuple item;
Size itemsz;
itemid = PageGetItemId(page, offnum);
item = (IndexTuple) PageGetItem(page, itemid);
itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
/*
* Will the new item go to left or right of split?
*/
if (offnum < newitemoff)
{
_bt_checksplitloc(&state, dataitemstoleft, offnum, false, previtem, item, itemsz);
dataitemstoleft += itemsz;
previtem = item;
}
else if (offnum == newitemoff)
{
/* need to try it both ways! */
_bt_checksplitloc(&state, dataitemstoleft, offnum, false, previtem, newitem, newitemsz);
dataitemstoleft += newitemsz;
previtem = newitem;
_bt_checksplitloc(&state, dataitemstoleft, offnum, true, previtem, item, itemsz);
dataitemstoleft += itemsz;
previtem = item;
}
else
{
_bt_checksplitloc(&state, dataitemstoleft, offnum, true, previtem, item, itemsz);
dataitemstoleft += itemsz;
previtem = item;
}
}
/*
* If the new item goes as the last item, check for splitting so that all
* the old items go to the left page and the new item goes to the right
* page.
*/
if (newitemoff > maxoff)
_bt_checksplitloc(&state, olddataitemstotal, maxoff, false, previtem, newitem, newitemsz);
/*
* Try to select a point where a distinguishing attribute appears earlier
* in the new high key for the left side of the split, in order to
* maximize the number of trailing attributes that can be truncated away.
* This is even useful with pages that only have a single (non-TID)
* attribute, since it's helpful to avoid appending an explicit heap TID
* attribute to the new pivot tuple (high key/downlink) when it cannot
* actually be truncated. Note that it is always assumed that caller goes
* on to perform truncation, even with pg_upgrade'd indexes where that
* isn't actually the case. There is still a modest benefit to choosing a
* split location while weighing suffix truncation: the resulting
* (untruncated) pivot tuples are nevertheless more predictive of future
* space utilization.
*
* Among all tuple possible split points, with the maximum amount of
* suffix truncation, we choose a split point that balances the left and
* right pages the best. If the page is the rightmost page on its level,
* however, we instead try to arrange to leave the left split page
* fillfactor% full. In this way, when we are inserting successively
* increasing keys (consider sequences, timestamps, etc) we will end up
* with a tree whose pages are about fillfactor% full, instead of the 50%
* full result that we'd get without this special case. This is the same
* as nbtsort.c produces for a newly-created tree. Note that leaf and
* nonleaf pages use different fillfactors.
*
* If the page is entirely full of duplicates, we try to leave the left
* split page even more full than in the fillfactor% rightmost page case.
* This maximizes space utilization in cases where tuples with the same
* attribute values span across many pages. Newly inserted duplicates
* will tend to have higher heap TID values, so we'll end up splitting to
* the right in the manner of ascending insertions of monotonically
* increasing values. See nbtree/README for more information about suffix
* truncation, and how a split point is chosen.
*/
/*
* Compute how many attributes we need to keep, in the most distinguishing
* possible split among all the candidate splits. We do this by comparing
* the leftmost candidate's last-left item, with the rightmost candidate's
* first-right item. The assumption here is that any split point in
* between those must keep at least as many keys.
*
* XXX: That's not quite true, because _bt_keep_natts_fast() uses binary
* comparison, and thus isn't totally accurate!
*/
state.optimal_natts = _bt_keep_natts_fast(rel,
candidates[0].lastleft_tuple,
candidates[state.ncandidates - 1].firstright_tuple);
if (!state.is_leaf)
{
if (P_RIGHTMOST(opaque))
{
state.propfullonleft = BTREE_NONLEAF_FILLFACTOR / 100.0;
state.is_weighted = true;
}
else
{
state.propfullonleft = 0.5;
state.is_weighted = false;
}
}
else if (state.optimal_natts > indnkeyatts)
{
/* The page is full of duplicates. */
state.propfullonleft = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
state.is_weighted = true;
}
else if (P_RIGHTMOST(opaque))
{
state.propfullonleft = leaffillfactor / 100.0;
state.is_weighted = true;
}
else
{
state.propfullonleft = 0.5;
state.is_weighted = false;
}
state.all_duplicates = (state.optimal_natts > indnkeyatts);
/*
* Finding the best possible split would require checking all the possible
* split points, because of the high-key and left-key special cases.
* That's probably more work than it's worth; instead, stop as soon as we
* find a sufficiently "good-enough" split, where good-enough is defined
* as an imbalance in free space of no more than pagesize/16
* (arbitrary...) This should let us stop near the middle on most pages,
* instead of plowing through all candidates.
*/
state.goodenough = leftspace / 16;
/*
* Find the best (or good enough) split among the candidates.
*/
state.best_candidate = NULL;
state.best_delta = INT_MAX;
_bt_findbestsplitloc(&state, 0, state.ncandidates - 1, 0);
/*
* I believe it is not possible to fail to find a feasible split, but just
* in case ...
*/
if (!state.best_candidate)
elog(ERROR, "could not find a feasible split point for index \"%s\"",
RelationGetRelationName(rel));
finalfirstright = state.best_candidate->firstoldonright;
*newitemonleft = state.best_candidate->newitemonleft;
pfree(state.candidates);
return finalfirstright;
}
/*
* Subroutine to analyze a particular possible split choice (ie, firstright
* and newitemonleft settings), and record it in state->candidates, if a
* split is possible at this point.
*
* firstoldonright is the offset of the first item on the original page
* that goes to the right page. firstoldonright can be > max offset, which
* means that all the old items go to the left page and only the new item goes
* to the right page.
*
* lastleftitem and firstrightitem are the tuples, between which the split
* is made. firstrightitemsz is the size of 'firstrightitem'.
*
* dataitemstoleft is the total size of all items on the left page,
* firstoldonright.
*
* Returns delta between space that will be left free on left and right side
* of split.
*/
static void
_bt_checksplitloc(FindSplitData *state,
int dataitemstoleft,
OffsetNumber firstoldonright,
bool newitemonleft,
IndexTuple lastleftitem,
IndexTuple firstrightitem,
Size firstrightitemsz)
{
int leftfree,
rightfree;
/* Account for all the tuples */
leftfree = state->leftspace - dataitemstoleft;
rightfree = state->rightspace -
(state->dataitemstotal - dataitemstoleft);
/*
* The first item on the right page becomes the high key of the left page;
* therefore it counts against left space as well as right space (we
* cannot assume that suffix truncation will make it any smaller). When
* index has included attributes, then those attributes of left page high
* key will be truncated leaving that page with slightly more free space.
* However, that shouldn't affect our ability to find valid split
* location, since we err in the direction of being pessimistic about free
* space on the left half. Besides, even when suffix truncation of
* non-TID attributes occurs, there often won't be an entire MAXALIGN()
* quantum in pivot space savings.
*
* If we are on the leaf level, assume that suffix truncation cannot avoid
* adding a heap TID to the left half's new high key when splitting at the
* leaf level. In practice the new high key will often be smaller and
* will rarely be larger, but conservatively assume the worst case.
*/
if (state->is_leaf)
leftfree -= (int) (firstrightitemsz +
MAXALIGN(sizeof(ItemPointerData)));
else
leftfree -= (int) firstrightitemsz;
/*
* If we are not on the leaf level, we will be able to discard the key
* data from the first item that winds up on the right page.
*/
if (!state->is_leaf)
rightfree += (int) firstrightitemsz -
(int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
if (leftfree >= 0 && rightfree >= 0)
{
/* this split is possible, memorize it */
state->candidates[state->ncandidates].leftfree = leftfree;
state->candidates[state->ncandidates].rightfree = rightfree;
state->candidates[state->ncandidates].firstoldonright = firstoldonright;
state->candidates[state->ncandidates].newitemonleft = newitemonleft;
state->candidates[state->ncandidates].lastleft_tuple = lastleftitem;
state->candidates[state->ncandidates].firstright_tuple = firstrightitem;
state->ncandidates++;
}
}
/*
* Recursive helper function, to find the best split location among the
* candidates in given range (inclusive).
*
* The best candidate is one that has needs the least amount of "distinguishing"
* keys. The caller already calculated that, in 'state.optimal_natts'.
* We find the candidate with the smallest (or "good enough") delta value,
* among the ones with the optimal number of distinguishing keys.
*
* We try to optimize for both properties at the same time. First, we use
* _bt_keep_natts_fast() to search for the position, or positions, where
* there are enough distinguishing keys. We do that recursively, like in a
* binary search, and whenever we find a branch with duplicates, we skip
* processing that half. If a page only has a few places that satisfy the
* "distinguishing keys" requirement, that zooms in quite quickly to the
* those locations.
*
* _bt_keep_natts_fast() is relatively expensive, though, so once we have
* "zoomed in" to a small range (32 items or less), we scan the range
* sequentially, looking for the smallest delta first, and only checking
* _bt_keep_natts_fast() if the current candidate's delta is smaller than
* the previous best. That avoids wasting a lot of effort when almost
* all split points satisfy the distinguishing keys requirement.
*/
static void
_bt_findbestsplitloc(FindSplitData *state, int left, int right, int level)
{
SplitPoint *left_candidate = &state->candidates[left];
SplitPoint *right_candidate = &state->candidates[right];
int center;
int this_natts;
/*
* First check if there is any split point in this range, with the optimal
* distinguishness. If not, skip this range.
*
* For example, if we found out earlier, that there is a split point on
* the page, such the left and half pages would differ on the first key
* column, but all the values in the range we're considering have the same
* value on the first column, then there must be a better split point
* elsewhere.
*/
if (level > 0 && !state->all_duplicates)
{
this_natts = _bt_keep_natts_fast(state->rel,
left_candidate->lastleft_tuple,
right_candidate->firstright_tuple);
if (this_natts > state->optimal_natts)
return;
}
#if 0
elog(NOTICE, "recurse %d: %4d - %4d", level, left, right);
#endif
/*
* Once we have a sufficiently small range left, scan all the entries,
* looking for the split with best balance between the pages (i.e.
* smallest delta)
*/
if (right - left < 32)
{
int i;
for (i = left; i <= right; i++)
{
SplitPoint *candidate = &state->candidates[i];
int leftfree = candidate->leftfree;
int rightfree = candidate->rightfree;
double delta;
if (state->is_weighted)
delta = state->propfullonleft * leftfree -
(1.0 - state->propfullonleft) * rightfree;
else
delta = leftfree - rightfree;
if (delta < 0)
delta = -delta;
if (delta < state->best_delta)
{
/*
* This candidate is better balanced than the previous best.
* But is it optimally distinguishing?
*/
bool distinguishing_enough;
if (state->all_duplicates)
distinguishing_enough = true;
else
{
this_natts = _bt_keep_natts_fast(state->rel,
candidate->lastleft_tuple,
candidate->firstright_tuple);
if (this_natts <= state->optimal_natts)
distinguishing_enough = true;
else
distinguishing_enough = false;
}
#if 0
elog(NOTICE, "candidate %3ld: %4d - %4d delta %f this_natts %d",
candidate - state->candidates,
candidate->leftfree,
candidate->rightfree,
delta, this_natts);
#endif
if (distinguishing_enough)
{
state->best_delta = delta;
state->best_candidate = candidate;
if (state->best_delta <= state->goodenough)
return;
}
}
}
}
else
{
/*
* Recurse into both halves. As an optimization, use the goal
* fillfactor as a guide on which half to recurse first. For example,
* if we're aiming for a 50/50 split, try the splits around the
* midpoint first. If we manage to find a "good enough" split early,
* we can skip scanning the rest. This is approximate, as we look for
* the midpoint of the split candidates, rather than free space, but
* it's close in the common case that the tuples are roughly the same
* size.
*/
center = (left + right) / 2;
if (state->ncandidates * state->propfullonleft < center)
{
_bt_findbestsplitloc(state, left, center, level + 1);
if (state->best_delta <= state->goodenough)
return;
_bt_findbestsplitloc(state, center + 1, right, level + 1);
}
else
{
_bt_findbestsplitloc(state, center + 1, right, level + 1);
if (state->best_delta <= state->goodenough)
return;
_bt_findbestsplitloc(state, left, center, level + 1);
}
}
}