000001 /* 000002 ** 2004 April 6 000003 ** 000004 ** The author disclaims copyright to this source code. In place of 000005 ** a legal notice, here is a blessing: 000006 ** 000007 ** May you do good and not evil. 000008 ** May you find forgiveness for yourself and forgive others. 000009 ** May you share freely, never taking more than you give. 000010 ** 000011 ************************************************************************* 000012 ** This file implements an external (disk-based) database using BTrees. 000013 ** See the header comment on "btreeInt.h" for additional information. 000014 ** Including a description of file format and an overview of operation. 000015 */ 000016 #include "btreeInt.h" 000017 000018 /* 000019 ** The header string that appears at the beginning of every 000020 ** SQLite database. 000021 */ 000022 static const char zMagicHeader[] = SQLITE_FILE_HEADER; 000023 000024 /* 000025 ** Set this global variable to 1 to enable tracing using the TRACE 000026 ** macro. The variable and the printf-based TRACE() definition are compiled only when the "#if 0" below is changed to "#if 1"; otherwise TRACE(X) expands to nothing. 000027 */ 000028 #if 0 000029 int sqlite3BtreeTrace=1; /* True to enable tracing */ 000030 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);} 000031 #else 000032 # define TRACE(X) 000033 #endif 000034 000035 /* 000036 ** Extract a 2-byte big-endian integer from an array of unsigned bytes. 000037 ** But if the value is zero, make it 65536. 000038 ** 000039 ** This routine is used to extract the "offset to cell content area" value 000040 ** from the header of a btree page. If the page size is 65536 and the page 000041 ** is empty, the offset should be 65536, but the 2-byte value stores zero. 000042 ** This routine makes the necessary adjustment to 65536. 
000043 */ 000044 #define get2byteNotZero(X) (((((int)get2byte(X))-1)&0xffff)+1) /* ((v-1)&0xffff)+1 maps 0 to 65536 and leaves 1..65535 unchanged */ 000045 000046 /* 000047 ** Values passed as the 5th argument to allocateBtreePage() 000048 */ 000049 #define BTALLOC_ANY 0 /* Allocate any page */ 000050 #define BTALLOC_EXACT 1 /* Allocate exact page if possible */ 000051 #define BTALLOC_LE 2 /* Allocate any page <= the parameter */ 000052 000053 /* 000054 ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not 000055 ** defined, or 0 if it is. For example: 000056 ** 000057 ** bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum); 000058 */ 000059 #ifndef SQLITE_OMIT_AUTOVACUUM 000060 #define IfNotOmitAV(expr) (expr) 000061 #else 000062 #define IfNotOmitAV(expr) 0 000063 #endif 000064 000065 #ifndef SQLITE_OMIT_SHARED_CACHE 000066 /* 000067 ** A list of BtShared objects that are eligible for participation 000068 ** in shared cache. This variable has file scope during normal builds, 000069 ** but the test harness needs to access it so we make it global for 000070 ** test builds. 000071 ** 000072 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MAIN. 000073 */ 000074 #ifdef SQLITE_TEST 000075 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 000076 #else 000077 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 000078 #endif 000079 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 000080 000081 #ifndef SQLITE_OMIT_SHARED_CACHE 000082 /* 000083 ** Enable or disable the shared pager and schema features. 000084 ** 000085 ** This routine has no effect on existing database connections. 000086 ** The shared cache setting affects only future calls to 000087 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). The return value is always SQLITE_OK. 
000088 */ 000089 int sqlite3_enable_shared_cache(int enable){ 000090 sqlite3GlobalConfig.sharedCacheEnabled = enable; 000091 return SQLITE_OK; 000092 } 000093 #endif 000094 000095 000096 000097 #ifdef SQLITE_OMIT_SHARED_CACHE 000098 /* 000099 ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(), 000100 ** and clearAllSharedCacheTableLocks() 000101 ** manipulate entries in the BtShared.pLock linked list used to store 000102 ** shared-cache table level locks. If the library is compiled with the 000103 ** shared-cache feature disabled, then there is only ever one user 000104 ** of each BtShared structure and so this locking is not necessary. 000105 ** So define the lock related functions as no-ops. downgradeAllSharedCacheTableLocks() is stubbed out as well, and the assert()-only helpers hasSharedCacheTableLock() and hasReadConflicts() collapse to the constants 1 and 0. 000106 */ 000107 #define querySharedCacheTableLock(a,b,c) SQLITE_OK 000108 #define setSharedCacheTableLock(a,b,c) SQLITE_OK 000109 #define clearAllSharedCacheTableLocks(a) 000110 #define downgradeAllSharedCacheTableLocks(a) 000111 #define hasSharedCacheTableLock(a,b,c,d) 1 000112 #define hasReadConflicts(a, b) 0 000113 #endif 000114 000115 #ifdef SQLITE_DEBUG 000116 /* 000117 ** Return and reset the seek counter for a Btree object. The counter (nSeek) is zeroed as a side effect, so each call reports the seeks counted since the previous call. 000118 */ 000119 sqlite3_uint64 sqlite3BtreeSeekCount(Btree *pBt){ 000120 u64 n = pBt->nSeek; 000121 pBt->nSeek = 0; 000122 return n; 000123 } 000124 #endif 000125 000126 /* 000127 ** Implementation of the SQLITE_CORRUPT_PAGE() macro. Takes a single 000128 ** (MemPage*) as an argument. The (MemPage*) must not be NULL. 000129 ** 000130 ** If SQLITE_DEBUG is not defined, then this macro is equivalent to 000131 ** SQLITE_CORRUPT_BKPT. Or, if SQLITE_DEBUG is set, then the log message 000132 ** normally produced as a side-effect of SQLITE_CORRUPT_BKPT is augmented 000133 ** with the page number and filename associated with the (MemPage*). 
000134 */ 000135 #ifdef SQLITE_DEBUG 000136 int corruptPageError(int lineno, MemPage *p){ 000137 char *zMsg; /* Formatted report text; if it cannot be allocated the extra report is skipped and SQLITE_CORRUPT_BKPT is still returned */ 000138 sqlite3BeginBenignMalloc(); 000139 zMsg = sqlite3_mprintf("database corruption page %u of %s", 000140 p->pgno, sqlite3PagerFilename(p->pBt->pPager, 0) 000141 ); 000142 sqlite3EndBenignMalloc(); 000143 if( zMsg ){ 000144 sqlite3ReportError(SQLITE_CORRUPT, lineno, zMsg); 000145 } 000146 sqlite3_free(zMsg); 000147 return SQLITE_CORRUPT_BKPT; 000148 } 000149 # define SQLITE_CORRUPT_PAGE(pMemPage) corruptPageError(__LINE__, pMemPage) 000150 #else 000151 # define SQLITE_CORRUPT_PAGE(pMemPage) SQLITE_CORRUPT_PGNO(pMemPage->pgno) /* Without SQLITE_DEBUG: expands to SQLITE_CORRUPT_PGNO() applied to the page number */ 000152 #endif 000153 000154 #ifndef SQLITE_OMIT_SHARED_CACHE 000155 000156 #ifdef SQLITE_DEBUG 000157 /* 000158 **** This function is only used as part of an assert() statement. *** 000159 ** 000160 ** Check to see if pBtree holds the required locks to read or write to the 000161 ** table with root page iRoot. Return 1 if it does and 0 if not. 000162 ** 000163 ** For example, when writing to a table with root-page iRoot via 000164 ** Btree connection pBtree: 000165 ** 000166 ** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) ); 000167 ** 000168 ** When writing to an index that resides in a sharable database, the 000169 ** caller should have first obtained a lock specifying the root page of 000170 ** the corresponding table. This makes things a bit more complicated, 000171 ** as this module treats each table as a separate structure. To determine 000172 ** the table corresponding to the index being written, this 000173 ** function has to search through the database schema. 000174 ** 000175 ** Instead of a lock on the table/index rooted at page iRoot, the caller may 000176 ** hold a write-lock on the schema table (root page 1). This is also 000177 ** acceptable. 
000178 */ 000179 static int hasSharedCacheTableLock( 000180 Btree *pBtree, /* Handle that must hold lock */ 000181 Pgno iRoot, /* Root page of b-tree */ 000182 int isIndex, /* True if iRoot is the root of an index b-tree */ 000183 int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */ 000184 ){ 000185 Schema *pSchema = (Schema *)pBtree->pBt->pSchema; 000186 Pgno iTab = 0; 000187 BtLock *pLock; 000188 000189 /* If this database is not shareable, or if the client is reading 000190 ** and has the read-uncommitted flag set, then no lock is required. 000191 ** Return true immediately. 000192 */ 000193 if( (pBtree->sharable==0) 000194 || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommit)) 000195 ){ 000196 return 1; 000197 } 000198 000199 /* If the client is reading or writing an index and the schema is 000200 ** not loaded, then it is too difficult to actually check to see if 000201 ** the correct locks are held. So do not bother - just return true. 000202 ** This case does not come up very often anyhow. 000203 */ 000204 if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){ 000205 return 1; 000206 } 000207 000208 /* Figure out the root-page that the lock should be held on. For table 000209 ** b-trees, this is just the root page of the b-tree being read or 000210 ** written. For index b-trees, it is the root page of the associated 000211 ** table. If no index in pSchema->idxHash has root page iRoot, iTab keeps its initial value of 0. */ 000212 if( isIndex ){ 000213 HashElem *p; 000214 int bSeen = 0; 000215 for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){ 000216 Index *pIdx = (Index *)sqliteHashData(p); 000217 if( pIdx->tnum==iRoot ){ 000218 if( bSeen ){ 000219 /* Two or more indexes share the same root page. There must 000220 ** be imposter tables. So just return true. The assert is not 000221 ** useful in that case. 
*/ 000222 return 1; 000223 } 000224 iTab = pIdx->pTable->tnum; 000225 bSeen = 1; 000226 } 000227 } 000228 }else{ 000229 iTab = iRoot; 000230 } 000231 000232 /* Search for the required lock. Either a write-lock on root-page iTab, a 000233 ** write-lock on the schema table, or (if the client is reading) a 000234 ** read-lock on iTab will suffice. Return 1 if any of these are found. Only locks owned by pBtree itself are considered. The eLock>=eLockType test lets a held write-lock satisfy a read-lock requirement (WRITE_LOCK compares greater than READ_LOCK). */ 000235 for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){ 000236 if( pLock->pBtree==pBtree 000237 && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1)) 000238 && pLock->eLock>=eLockType 000239 ){ 000240 return 1; 000241 } 000242 } 000243 000244 /* Failed to find the required lock. */ 000245 return 0; 000246 } 000247 #endif /* SQLITE_DEBUG */ 000248 000249 #ifdef SQLITE_DEBUG 000250 /* 000251 **** This function may be used as part of assert() statements only. **** 000252 ** 000253 ** Return true if it would be illegal for pBtree to write into the 000254 ** table or index rooted at iRoot because other shared connections are 000255 ** simultaneously reading that same table or index. 000256 ** 000257 ** It is illegal for pBtree to write if some other Btree object that 000258 ** shares the same BtShared object is currently reading or writing 000259 ** the iRoot table. Except, if the other Btree object has the 000260 ** read-uncommitted flag set, then it is OK for the other object to 000261 ** have a read cursor. 
000262 ** 000263 ** For example, before writing to any part of the table or index 000264 ** rooted at page iRoot, one should call: 000265 ** 000266 ** assert( !hasReadConflicts(pBtree, iRoot) ); 000267 */ 000268 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){ 000269 BtCursor *p; /* Walks every cursor open on the shared BtShared; a cursor on iRoot that belongs to a different Btree whose connection lacks read-uncommitted is a conflict */ 000270 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 000271 if( p->pgnoRoot==iRoot 000272 && p->pBtree!=pBtree 000273 && 0==(p->pBtree->db->flags & SQLITE_ReadUncommit) 000274 ){ 000275 return 1; 000276 } 000277 } 000278 return 0; 000279 } 000280 #endif /* #ifdef SQLITE_DEBUG */ 000281 000282 /* 000283 ** Query to see if Btree handle p may obtain a lock of type eLock 000284 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return 000285 ** SQLITE_OK if the lock may be obtained (by calling 000286 ** setSharedCacheTableLock()), or SQLITE_LOCKED_SHAREDCACHE if not. When the request is refused, sqlite3ConnectionBlocked() is called with the blocking connection; a write-lock request refused because of another handle's lock on iTab also sets the BTS_PENDING flag. 000287 */ 000288 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){ 000289 BtShared *pBt = p->pBt; 000290 BtLock *pIter; 000291 000292 assert( sqlite3BtreeHoldsMutex(p) ); 000293 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 000294 assert( p->db!=0 ); 000295 assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 ); 000296 000297 /* If requesting a write-lock, then the Btree must have an open write 000298 ** transaction on this file. And, obviously, for this to be so there 000299 ** must be an open write transaction on the file itself. 000300 */ 000301 assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) ); 000302 assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE ); 000303 000304 /* This routine is a no-op if the shared-cache is not enabled */ 000305 if( !p->sharable ){ 000306 return SQLITE_OK; 000307 } 000308 000309 /* If some other connection is holding an exclusive lock, the 000310 ** requested lock may not be obtained. 
000311 */ 000312 if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){ 000313 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db); 000314 return SQLITE_LOCKED_SHAREDCACHE; 000315 } 000316 000317 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 000318 /* The condition (pIter->eLock!=eLock) in the following if(...) 000319 ** statement is a simplification of: 000320 ** 000321 ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK) 000322 ** 000323 ** since we know that if eLock==WRITE_LOCK, then no other connection 000324 ** may hold a WRITE_LOCK on any table in this file (since there can 000325 ** only be a single writer). 000326 */ 000327 assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK ); 000328 assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK); 000329 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){ 000330 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db); 000331 if( eLock==WRITE_LOCK ){ 000332 assert( p==pBt->pWriter ); 000333 pBt->btsFlags |= BTS_PENDING; 000334 } 000335 return SQLITE_LOCKED_SHAREDCACHE; 000336 } 000337 } 000338 return SQLITE_OK; 000339 } 000340 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 000341 000342 #ifndef SQLITE_OMIT_SHARED_CACHE 000343 /* 000344 ** Add a lock on the table with root-page iTable to the shared-btree used 000345 ** by Btree handle p. Parameter eLock must be either READ_LOCK or 000346 ** WRITE_LOCK. 000347 ** 000348 ** This function assumes the following: 000349 ** 000350 ** (a) The specified Btree object p is connected to a sharable 000351 ** database (one with the BtShared.sharable flag set), and 000352 ** 000353 ** (b) No other Btree objects hold a lock that conflicts 000354 ** with the requested lock (i.e. querySharedCacheTableLock() has 000355 ** already been called and returned SQLITE_OK). 000356 ** 000357 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM 000358 ** is returned if a malloc attempt fails. 
000359 */ 000360 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){ 000361 BtShared *pBt = p->pBt; 000362 BtLock *pLock = 0; 000363 BtLock *pIter; 000364 000365 assert( sqlite3BtreeHoldsMutex(p) ); 000366 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 000367 assert( p->db!=0 ); 000368 000369 /* A connection with the read-uncommitted flag set will never try to 000370 ** obtain a read-lock using this function. The only read-lock obtained 000371 ** by a connection in read-uncommitted mode is on the sqlite_schema 000372 ** table, and that lock is obtained in BtreeBeginTrans(). */ 000373 assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK ); 000374 000375 /* This function should only be called on a sharable b-tree after it 000376 ** has been determined that no other b-tree holds a conflicting lock. */ 000377 assert( p->sharable ); 000378 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) ); 000379 000380 /* First search the list for an existing lock on this table. */ 000381 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 000382 if( pIter->iTable==iTable && pIter->pBtree==p ){ 000383 pLock = pIter; 000384 break; 000385 } 000386 } 000387 000388 /* If the above search did not find a BtLock struct associating Btree p 000389 ** with table iTable, allocate one and link it into the list. The new entry is pushed onto the head of the pBt->pLock list. 000390 */ 000391 if( !pLock ){ 000392 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock)); 000393 if( !pLock ){ 000394 return SQLITE_NOMEM_BKPT; 000395 } 000396 pLock->iTable = iTable; 000397 pLock->pBtree = p; 000398 pLock->pNext = pBt->pLock; 000399 pBt->pLock = pLock; 000400 } 000401 000402 /* Set the BtLock.eLock variable to the maximum of the current lock 000403 ** and the requested lock. This means if a write-lock was already held 000404 ** and a read-lock requested, we don't incorrectly downgrade the lock. 
000405 */ 000406 assert( WRITE_LOCK>READ_LOCK ); 000407 if( eLock>pLock->eLock ){ 000408 pLock->eLock = eLock; 000409 } 000410 000411 return SQLITE_OK; 000412 } 000413 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 000414 000415 #ifndef SQLITE_OMIT_SHARED_CACHE 000416 /* 000417 ** Release all the table locks (locks obtained via calls to 000418 ** the setSharedCacheTableLock() procedure) held by Btree object p. 000419 ** 000420 ** This function assumes that Btree p has an open read or write 000421 ** transaction. If it does not, then the BTS_PENDING flag 000422 ** may be incorrectly cleared. 000423 */ 000424 static void clearAllSharedCacheTableLocks(Btree *p){ 000425 BtShared *pBt = p->pBt; 000426 BtLock **ppIter = &pBt->pLock; 000427 000428 assert( sqlite3BtreeHoldsMutex(p) ); 000429 assert( p->sharable || 0==*ppIter ); 000430 assert( p->inTrans>0 ); 000431 000432 while( *ppIter ){ 000433 BtLock *pLock = *ppIter; 000434 assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree ); 000435 assert( pLock->pBtree->inTrans>=pLock->eLock ); 000436 if( pLock->pBtree==p ){ 000437 *ppIter = pLock->pNext; /* Unlink. The lock on table 1 is the BtLock embedded in the Btree (p->lock), so it is unlinked but never freed */ 000438 assert( pLock->iTable!=1 || pLock==&p->lock ); 000439 if( pLock->iTable!=1 ){ 000440 sqlite3_free(pLock); 000441 } 000442 }else{ 000443 ppIter = &pLock->pNext; 000444 } 000445 } 000446 000447 assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter ); 000448 if( pBt->pWriter==p ){ 000449 pBt->pWriter = 0; 000450 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING); 000451 }else if( pBt->nTransaction==2 ){ 000452 /* This function is called when Btree p is concluding its 000453 ** transaction. If there currently exists a writer, and p is not 000454 ** that writer, then the number of locks held by connections other 000455 ** than the writer must be about to drop to zero. In this case 000456 ** set the BTS_PENDING flag to 0. 000457 ** 000458 ** If there is not currently a writer, then BTS_PENDING must 000459 ** be zero already. So this next line is harmless in that case. 
000460 */ 000461 pBt->btsFlags &= ~BTS_PENDING; 000462 } 000463 } 000464 000465 /* 000466 ** This function changes all write-locks held by Btree p into read-locks. It does nothing unless p is the current writer (pBt->pWriter); in that case pWriter is also reset to 0 and the BTS_EXCLUSIVE and BTS_PENDING flags are cleared. 000467 */ 000468 static void downgradeAllSharedCacheTableLocks(Btree *p){ 000469 BtShared *pBt = p->pBt; 000470 if( pBt->pWriter==p ){ 000471 BtLock *pLock; 000472 pBt->pWriter = 0; 000473 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING); 000474 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){ 000475 assert( pLock->eLock==READ_LOCK || pLock->pBtree==p ); 000476 pLock->eLock = READ_LOCK; 000477 } 000478 } 000479 } 000480 000481 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 000482 000483 static void releasePage(MemPage *pPage); /* Forward reference */ 000484 static void releasePageOne(MemPage *pPage); /* Forward reference */ 000485 static void releasePageNotNull(MemPage *pPage); /* Forward reference */ 000486 000487 /* 000488 ***** This routine is used inside of assert() only **** 000489 ** 000490 ** Verify that the cursor holds the mutex on its BtShared 000491 */ 000492 #ifdef SQLITE_DEBUG 000493 static int cursorHoldsMutex(BtCursor *p){ 000494 return sqlite3_mutex_held(p->pBt->mutex); 000495 } 000496 000497 /* Verify that the cursor and the BtShared agree about what is the current 000498 ** database connection. This is important in shared-cache mode. If the database 000499 ** connection pointers get out-of-sync, it is possible for routines like 000500 ** btreeInitPage() to reference a stale connection pointer that references 000501 ** a connection that has already closed. This routine is used inside assert() 000502 ** statements only and for the purpose of double-checking that the btree code 000503 ** does keep the database connection pointers up-to-date. 
000504 */ 000505 static int cursorOwnsBtShared(BtCursor *p){ 000506 assert( cursorHoldsMutex(p) ); 000507 return (p->pBtree->db==p->pBt->db); 000508 } 000509 #endif 000510 000511 /* 000512 ** Invalidate the overflow cache of the cursor passed as the first argument 000513 ** by clearing its BTCF_ValidOvfl flag. 000514 */ 000515 #define invalidateOverflowCache(pCur) (pCur->curFlags &= ~BTCF_ValidOvfl) 000516 000517 /* 000518 ** Invalidate the overflow page-list cache for all cursors opened 000519 ** on the shared btree structure pBt. 000520 */ 000521 static void invalidateAllOverflowCache(BtShared *pBt){ 000522 BtCursor *p; 000523 assert( sqlite3_mutex_held(pBt->mutex) ); 000524 for(p=pBt->pCursor; p; p=p->pNext){ 000525 invalidateOverflowCache(p); 000526 } 000527 } 000528 000529 #ifndef SQLITE_OMIT_INCRBLOB 000530 /* 000531 ** This function is called before modifying the contents of a table 000532 ** to invalidate any incrblob cursors that are open on the 000533 ** row or one of the rows being modified. 000534 ** 000535 ** If argument isClearTable is true, then the entire contents of the 000536 ** table is about to be deleted. In this case invalidate all incrblob 000537 ** cursors open on any row within the table with root-page pgnoRoot. 000538 ** 000539 ** Otherwise, if argument isClearTable is false, then the row with 000540 ** rowid iRow is being replaced or deleted. In this case invalidate 000541 ** only those incrblob cursors open on that specific row. 
000542 */ 000543 static void invalidateIncrblobCursors( 000544 Btree *pBtree, /* The database file to check */ 000545 Pgno pgnoRoot, /* The table that might be changing */ 000546 i64 iRow, /* The rowid that might be changing */ 000547 int isClearTable /* True if all rows are being deleted */ 000548 ){ 000549 BtCursor *p; 000550 assert( pBtree->hasIncrblobCur ); 000551 assert( sqlite3BtreeHoldsMutex(pBtree) ); 000552 /* hasIncrblobCur is recomputed by the scan below: it ends up 1 only if at least one cursor on this BtShared still carries BTCF_Incrblob */ pBtree->hasIncrblobCur = 0; 000553 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 000554 if( (p->curFlags & BTCF_Incrblob)!=0 ){ 000555 pBtree->hasIncrblobCur = 1; 000556 if( p->pgnoRoot==pgnoRoot && (isClearTable || p->info.nKey==iRow) ){ 000557 p->eState = CURSOR_INVALID; 000558 } 000559 } 000560 } 000561 } 000562 000563 #else 000564 /* Stub function when INCRBLOB is omitted */ 000565 #define invalidateIncrblobCursors(w,x,y,z) 000566 #endif /* SQLITE_OMIT_INCRBLOB */ 000567 000568 /* 000569 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called 000570 ** when a page that previously contained data becomes a free-list leaf 000571 ** page. 000572 ** 000573 ** The BtShared.pHasContent bitvec exists to work around an obscure 000574 ** bug caused by the interaction of two useful IO optimizations surrounding 000575 ** free-list leaf pages: 000576 ** 000577 ** 1) When all data is deleted from a page and the page becomes 000578 ** a free-list leaf page, the page is not written to the database 000579 ** (as free-list leaf pages contain no meaningful data). Sometimes 000580 ** such a page is not even journalled (as it will not be modified, 000581 ** why bother journalling it?). 000582 ** 000583 ** 2) When a free-list leaf page is reused, its content is not read 000584 ** from the database or written to the journal file (why should it 000585 ** be, if it is not at all meaningful?). 000586 ** 000587 ** By themselves, these optimizations work fine and provide a handy 000588 ** performance boost to bulk delete or insert operations. 
However, if 000589 ** a page is moved to the free-list and then reused within the same 000590 ** transaction, a problem comes up. If the page is not journalled when 000591 ** it is moved to the free-list and it is also not journalled when it 000592 ** is extracted from the free-list and reused, then the original data 000593 ** may be lost. In the event of a rollback, it may not be possible 000594 ** to restore the database to its original configuration. 000595 ** 000596 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is 000597 ** moved to become a free-list leaf page, the corresponding bit is 000598 ** set in the bitvec. Whenever a leaf page is extracted from the free-list, 000599 ** optimization 2 above is omitted if the corresponding bit is already 000600 ** set in BtShared.pHasContent. The contents of the bitvec are cleared 000601 ** at the end of every transaction. 000602 */ 000603 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){ 000604 int rc = SQLITE_OK; /* The bitvec is created on first use, sized to pBt->nPage at that moment; SQLITE_NOMEM_BKPT is returned if that fails */ 000605 if( !pBt->pHasContent ){ 000606 assert( pgno<=pBt->nPage ); 000607 pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage); 000608 if( !pBt->pHasContent ){ 000609 rc = SQLITE_NOMEM_BKPT; 000610 } 000611 } 000612 /* A pgno larger than the bitvec size is not recorded and rc stays SQLITE_OK */ if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){ 000613 rc = sqlite3BitvecSet(pBt->pHasContent, pgno); 000614 } 000615 return rc; 000616 } 000617 000618 /* 000619 ** Query the BtShared.pHasContent vector. 000620 ** 000621 ** This function is called when a free-list leaf page is removed from the 000622 ** free-list for reuse. It returns false if it is safe to retrieve the 000623 ** page from the pager layer with the 'no-content' flag set. True otherwise. If no bitvec has been allocated the result is false; a page number larger than the bitvec size yields true. 000624 */ 000625 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){ 000626 Bitvec *p = pBt->pHasContent; 000627 return p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTestNotNull(p, pgno)); 000628 } 000629 000630 /* 000631 ** Clear (destroy) the BtShared.pHasContent bitvec. 
This should be 000632 ** invoked at the conclusion of each write-transaction. 000633 */ 000634 static void btreeClearHasContent(BtShared *pBt){ 000635 sqlite3BitvecDestroy(pBt->pHasContent); 000636 pBt->pHasContent = 0; 000637 } 000638 000639 /* 000640 ** Release all of the apPage[] pages for a cursor, plus its current page (pCur->pPage), and mark the cursor as holding no pages by setting iPage to -1. Nothing is done if iPage is already negative. 000641 */ 000642 static void btreeReleaseAllCursorPages(BtCursor *pCur){ 000643 int i; 000644 if( pCur->iPage>=0 ){ 000645 for(i=0; i<pCur->iPage; i++){ 000646 releasePageNotNull(pCur->apPage[i]); 000647 } 000648 releasePageNotNull(pCur->pPage); 000649 pCur->iPage = -1; 000650 } 000651 } 000652 000653 /* 000654 ** The cursor passed as the only argument must point to a valid entry 000655 ** when this function is called (i.e. have eState==CURSOR_VALID). This 000656 ** function saves the current cursor key in variables pCur->nKey and 000657 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error 000658 ** code otherwise. 000659 ** 000660 ** If the cursor is open on an intkey table, then the integer key 000661 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to 000662 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is 000663 ** set to point to a malloced buffer pCur->nKey bytes in size containing 000664 ** the key. 000665 */ 000666 static int saveCursorKey(BtCursor *pCur){ 000667 int rc = SQLITE_OK; 000668 assert( CURSOR_VALID==pCur->eState ); 000669 assert( 0==pCur->pKey ); 000670 assert( cursorHoldsMutex(pCur) ); 000671 000672 if( pCur->curIntKey ){ 000673 /* Only the rowid is required for a table btree */ 000674 pCur->nKey = sqlite3BtreeIntegerKey(pCur); 000675 }else{ 000676 /* For an index btree, save the complete key content. It is possible 000677 ** that the current key is corrupt. In that case, it is possible that 000678 ** the sqlite3VdbeRecordUnpack() function may overread the buffer by 000679 ** up to the size of 1 varint plus 1 8-byte value when the cursor 000680 ** position is restored. 
Hence the 17 bytes of padding allocated 000681 ** below. The padding (9+8 bytes) is zero-filled once the key has been copied; on a read error the buffer is freed and pCur->pKey is left unset. */ 000682 void *pKey; 000683 pCur->nKey = sqlite3BtreePayloadSize(pCur); 000684 pKey = sqlite3Malloc( pCur->nKey + 9 + 8 ); 000685 if( pKey ){ 000686 rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey); 000687 if( rc==SQLITE_OK ){ 000688 memset(((u8*)pKey)+pCur->nKey, 0, 9+8); 000689 pCur->pKey = pKey; 000690 }else{ 000691 sqlite3_free(pKey); 000692 } 000693 }else{ 000694 rc = SQLITE_NOMEM_BKPT; 000695 } 000696 } 000697 assert( !pCur->curIntKey || !pCur->pKey ); 000698 return rc; 000699 } 000700 000701 /* 000702 ** Save the current cursor position in the variables BtCursor.nKey 000703 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. 000704 ** 000705 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID or CURSOR_SKIPNEXT) 000706 ** prior to calling this routine. If the cursor has the BTCF_Pinned flag set, SQLITE_CONSTRAINT_PINNED is returned and nothing is saved. 000707 */ 000708 static int saveCursorPosition(BtCursor *pCur){ 000709 int rc; 000710 000711 assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState ); 000712 assert( 0==pCur->pKey ); 000713 assert( cursorHoldsMutex(pCur) ); 000714 000715 if( pCur->curFlags & BTCF_Pinned ){ 000716 return SQLITE_CONSTRAINT_PINNED; 000717 } 000718 if( pCur->eState==CURSOR_SKIPNEXT ){ 000719 pCur->eState = CURSOR_VALID; 000720 }else{ 000721 pCur->skipNext = 0; 000722 } 000723 000724 rc = saveCursorKey(pCur); 000725 if( rc==SQLITE_OK ){ 000726 btreeReleaseAllCursorPages(pCur); 000727 pCur->eState = CURSOR_REQUIRESEEK; 000728 } 000729 000730 /* These cached-state flags are dropped whether or not the save succeeded */ pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast); 000731 return rc; 000732 } 000733 000734 /* Forward reference */ 000735 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*); 000736 000737 /* 000738 ** Save the positions of all cursors (except pExcept) that are open on 000739 ** the table with root-page iRoot. 
"Saving the cursor position" means that 000740 ** the location in the btree is remembered in such a way that it can be 000741 ** moved back to the same spot after the btree has been modified. This 000742 ** routine is called just before cursor pExcept is used to modify the 000743 ** table, for example in BtreeDelete() or BtreeInsert(). 000744 ** 000745 ** If there are two or more cursors on the same btree, then all such 000746 ** cursors should have their BTCF_Multiple flag set. The btreeCursor() 000747 ** routine enforces that rule. This routine only needs to be called in 000748 ** the uncommon case when pExcept has the BTCF_Multiple flag set. 000749 ** 000750 ** If pExcept!=NULL and if no other cursors are found on the same root-page, 000751 ** then the BTCF_Multiple flag on pExcept is cleared, to avoid another 000752 ** pointless call to this routine. 000753 ** 000754 ** Implementation note: This routine merely checks to see if any cursors 000755 ** need to be saved. It calls out to saveCursorsOnList() in the (unusual) 000756 ** event that cursors are in need of being saved. An iRoot of zero matches cursors on every root page. 000757 */ 000758 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ 000759 BtCursor *p; 000760 assert( sqlite3_mutex_held(pBt->mutex) ); 000761 assert( pExcept==0 || pExcept->pBt==pBt ); 000762 for(p=pBt->pCursor; p; p=p->pNext){ 000763 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break; 000764 } 000765 if( p ) return saveCursorsOnList(p, iRoot, pExcept); 000766 if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple; 000767 return SQLITE_OK; 000768 } 000769 000770 /* This helper routine to saveAllCursors does the actual work of saving 000771 ** the cursors if and when a cursor is found that actually requires saving. 000772 ** The common case is that no cursors need to be saved, so this routine is 000773 ** broken out from its caller to avoid unnecessary stack pointer movement. 
000774 */ 000775 static int SQLITE_NOINLINE saveCursorsOnList( 000776 BtCursor *p, /* The first cursor that needs saving */ 000777 Pgno iRoot, /* Only save cursor with this iRoot. Save all if zero */ 000778 BtCursor *pExcept /* Do not save this cursor */ 000779 ){ 000780 do{ 000781 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){ 000782 if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){ 000783 int rc = saveCursorPosition(p); 000784 if( SQLITE_OK!=rc ){ 000785 /* Stop at the first failure; cursors later in the list are not visited */ return rc; 000786 } 000787 }else{ 000788 /* Cursor is in some other state: there is no position to save, so only release its pages */ testcase( p->iPage>=0 ); 000789 btreeReleaseAllCursorPages(p); 000790 } 000791 } 000792 p = p->pNext; 000793 }while( p ); 000794 return SQLITE_OK; 000795 } 000796 000797 /* 000798 ** Clear the current cursor position. Any saved key (pCur->pKey) is freed and the cursor state becomes CURSOR_INVALID. 000799 */ 000800 void sqlite3BtreeClearCursor(BtCursor *pCur){ 000801 assert( cursorHoldsMutex(pCur) ); 000802 sqlite3_free(pCur->pKey); 000803 pCur->pKey = 0; 000804 pCur->eState = CURSOR_INVALID; 000805 } 000806 000807 /* 000808 ** In this version of BtreeMoveto, pKey is a packed index record 000809 ** such as is generated by the OP_MakeRecord opcode. Unpack the 000810 ** record and then call sqlite3BtreeIndexMoveto() to do the work. If pKey is NULL, nKey is instead passed as the integer key to sqlite3BtreeTableMoveto(); the bias argument is used on that path only. Returns SQLITE_NOMEM_BKPT if the unpacked record cannot be allocated, and SQLITE_CORRUPT_BKPT if the unpacked record has zero fields or more fields than pKeyInfo->nAllField. 000811 */ 000812 static int btreeMoveto( 000813 BtCursor *pCur, /* Cursor open on the btree to be searched */ 000814 const void *pKey, /* Packed key if the btree is an index */ 000815 i64 nKey, /* Integer key for tables. 
Size of pKey for indices */ 000816 int bias, /* Bias search to the high end */ 000817 int *pRes /* Write search results here */ 000818 ){ 000819 int rc; /* Status code */ 000820 UnpackedRecord *pIdxKey; /* Unpacked index key */ 000821 000822 if( pKey ){ 000823 KeyInfo *pKeyInfo = pCur->pKeyInfo; 000824 assert( nKey==(i64)(int)nKey ); 000825 pIdxKey = sqlite3VdbeAllocUnpackedRecord(pKeyInfo); 000826 if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT; 000827 sqlite3VdbeRecordUnpack(pKeyInfo, (int)nKey, pKey, pIdxKey); 000828 if( pIdxKey->nField==0 || pIdxKey->nField>pKeyInfo->nAllField ){ 000829 rc = SQLITE_CORRUPT_BKPT; 000830 }else{ 000831 rc = sqlite3BtreeIndexMoveto(pCur, pIdxKey, pRes); 000832 } 000833 /* The unpacked record is released on both the success and the corrupt path */ sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey); 000834 }else{ 000835 pIdxKey = 0; 000836 rc = sqlite3BtreeTableMoveto(pCur, nKey, bias, pRes); 000837 } 000838 return rc; 000839 } 000840 000841 /* 000842 ** Restore the cursor to the position it was in (or as close to as possible) 000843 ** when saveCursorPosition() was called. Note that this call deletes the 000844 ** saved position info stored by saveCursorPosition(), so there can be 000845 ** at most one effective restoreCursorPosition() call after each 000846 ** saveCursorPosition(). 
000847 */ 000848 static int btreeRestoreCursorPosition(BtCursor *pCur){ 000849 int rc; 000850 int skipNext = 0; 000851 assert( cursorOwnsBtShared(pCur) ); 000852 assert( pCur->eState>=CURSOR_REQUIRESEEK ); 000853 if( pCur->eState==CURSOR_FAULT ){ 000854 return pCur->skipNext; 000855 } 000856 pCur->eState = CURSOR_INVALID; 000857 if( sqlite3FaultSim(410) ){ 000858 rc = SQLITE_IOERR; 000859 }else{ 000860 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext); 000861 } 000862 if( rc==SQLITE_OK ){ 000863 sqlite3_free(pCur->pKey); 000864 pCur->pKey = 0; 000865 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID ); 000866 if( skipNext ) pCur->skipNext = skipNext; 000867 if( pCur->skipNext && pCur->eState==CURSOR_VALID ){ 000868 pCur->eState = CURSOR_SKIPNEXT; 000869 } 000870 } 000871 return rc; 000872 } 000873 000874 #define restoreCursorPosition(p) \ 000875 (p->eState>=CURSOR_REQUIRESEEK ? \ 000876 btreeRestoreCursorPosition(p) : \ 000877 SQLITE_OK) 000878 000879 /* 000880 ** Determine whether or not a cursor has moved from the position where 000881 ** it was last placed, or has been invalidated for any other reason. 000882 ** Cursors can move when the row they are pointing at is deleted out 000883 ** from under them, for example. Cursor might also move if a btree 000884 ** is rebalanced. 000885 ** 000886 ** Calling this routine with a NULL cursor pointer returns false. 000887 ** 000888 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor 000889 ** back to where it ought to be if this routine returns true. 
000890 */ 000891 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){ 000892 assert( EIGHT_BYTE_ALIGNMENT(pCur) 000893 || pCur==sqlite3BtreeFakeValidCursor() ); 000894 assert( offsetof(BtCursor, eState)==0 ); 000895 assert( sizeof(pCur->eState)==1 ); 000896 return CURSOR_VALID != *(u8*)pCur; 000897 } 000898 000899 /* 000900 ** Return a pointer to a fake BtCursor object that will always answer 000901 ** false to the sqlite3BtreeCursorHasMoved() routine above. The fake 000902 ** cursor returned must not be used with any other Btree interface. 000903 */ 000904 BtCursor *sqlite3BtreeFakeValidCursor(void){ 000905 static u8 fakeCursor = CURSOR_VALID; 000906 assert( offsetof(BtCursor, eState)==0 ); 000907 return (BtCursor*)&fakeCursor; 000908 } 000909 000910 /* 000911 ** This routine restores a cursor back to its original position after it 000912 ** has been moved by some outside activity (such as a btree rebalance or 000913 ** a row having been deleted out from under the cursor). 000914 ** 000915 ** On success, the *pDifferentRow parameter is false if the cursor is left 000916 ** pointing at exactly the same row. *pDifferntRow is the row the cursor 000917 ** was pointing to has been deleted, forcing the cursor to point to some 000918 ** nearby row. 000919 ** 000920 ** This routine should only be called for a cursor that just returned 000921 ** TRUE from sqlite3BtreeCursorHasMoved(). 000922 */ 000923 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){ 000924 int rc; 000925 000926 assert( pCur!=0 ); 000927 assert( pCur->eState!=CURSOR_VALID ); 000928 rc = restoreCursorPosition(pCur); 000929 if( rc ){ 000930 *pDifferentRow = 1; 000931 return rc; 000932 } 000933 if( pCur->eState!=CURSOR_VALID ){ 000934 *pDifferentRow = 1; 000935 }else{ 000936 *pDifferentRow = 0; 000937 } 000938 return SQLITE_OK; 000939 } 000940 000941 #ifdef SQLITE_ENABLE_CURSOR_HINTS 000942 /* 000943 ** Provide hints to the cursor. 
The particular hint given (and the type 000944 ** and number of the varargs parameters) is determined by the eHintType 000945 ** parameter. See the definitions of the BTREE_HINT_* macros for details. 000946 */ 000947 void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){ 000948 /* Used only by system that substitute their own storage engine */ 000949 #ifdef SQLITE_DEBUG 000950 if( ALWAYS(eHintType==BTREE_HINT_RANGE) ){ 000951 va_list ap; 000952 Expr *pExpr; 000953 Walker w; 000954 memset(&w, 0, sizeof(w)); 000955 w.xExprCallback = sqlite3CursorRangeHintExprCheck; 000956 va_start(ap, eHintType); 000957 pExpr = va_arg(ap, Expr*); 000958 w.u.aMem = va_arg(ap, Mem*); 000959 va_end(ap); 000960 assert( pExpr!=0 ); 000961 assert( w.u.aMem!=0 ); 000962 sqlite3WalkExpr(&w, pExpr); 000963 } 000964 #endif /* SQLITE_DEBUG */ 000965 } 000966 #endif /* SQLITE_ENABLE_CURSOR_HINTS */ 000967 000968 000969 /* 000970 ** Provide flag hints to the cursor. 000971 */ 000972 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){ 000973 assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 ); 000974 pCur->hints = x; 000975 } 000976 000977 000978 #ifndef SQLITE_OMIT_AUTOVACUUM 000979 /* 000980 ** Given a page number of a regular database page, return the page 000981 ** number for the pointer-map page that contains the entry for the 000982 ** input page number. 000983 ** 000984 ** Return 0 (not a valid page) for pgno==1 since there is 000985 ** no pointer map associated with page 1. The integrity_check logic 000986 ** requires that ptrmapPageno(*,1)!=1. 
000987 */ 000988 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ 000989 int nPagesPerMapPage; 000990 Pgno iPtrMap, ret; 000991 assert( sqlite3_mutex_held(pBt->mutex) ); 000992 if( pgno<2 ) return 0; 000993 nPagesPerMapPage = (pBt->usableSize/5)+1; 000994 iPtrMap = (pgno-2)/nPagesPerMapPage; 000995 ret = (iPtrMap*nPagesPerMapPage) + 2; 000996 if( ret==PENDING_BYTE_PAGE(pBt) ){ 000997 ret++; 000998 } 000999 return ret; 001000 } 001001 001002 /* 001003 ** Write an entry into the pointer map. 001004 ** 001005 ** This routine updates the pointer map entry for page number 'key' 001006 ** so that it maps to type 'eType' and parent page number 'pgno'. 001007 ** 001008 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is 001009 ** a no-op. If an error occurs, the appropriate error code is written 001010 ** into *pRC. 001011 */ 001012 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){ 001013 DbPage *pDbPage; /* The pointer map page */ 001014 u8 *pPtrmap; /* The pointer map data */ 001015 Pgno iPtrmap; /* The pointer map page number */ 001016 int offset; /* Offset in pointer map page */ 001017 int rc; /* Return code from subfunctions */ 001018 001019 if( *pRC ) return; 001020 001021 assert( sqlite3_mutex_held(pBt->mutex) ); 001022 /* The super-journal page number must never be used as a pointer map page */ 001023 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) ); 001024 001025 assert( pBt->autoVacuum ); 001026 if( key==0 ){ 001027 *pRC = SQLITE_CORRUPT_BKPT; 001028 return; 001029 } 001030 iPtrmap = PTRMAP_PAGENO(pBt, key); 001031 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0); 001032 if( rc!=SQLITE_OK ){ 001033 *pRC = rc; 001034 return; 001035 } 001036 if( ((char*)sqlite3PagerGetExtra(pDbPage))[0]!=0 ){ 001037 /* The first byte of the extra data is the MemPage.isInit byte. 001038 ** If that byte is set, it means this page is also being used 001039 ** as a btree page. 
*/ 001040 *pRC = SQLITE_CORRUPT_BKPT; 001041 goto ptrmap_exit; 001042 } 001043 offset = PTRMAP_PTROFFSET(iPtrmap, key); 001044 if( offset<0 ){ 001045 *pRC = SQLITE_CORRUPT_BKPT; 001046 goto ptrmap_exit; 001047 } 001048 assert( offset <= (int)pBt->usableSize-5 ); 001049 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 001050 001051 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ 001052 TRACE(("PTRMAP_UPDATE: %u->(%u,%u)\n", key, eType, parent)); 001053 *pRC= rc = sqlite3PagerWrite(pDbPage); 001054 if( rc==SQLITE_OK ){ 001055 pPtrmap[offset] = eType; 001056 put4byte(&pPtrmap[offset+1], parent); 001057 } 001058 } 001059 001060 ptrmap_exit: 001061 sqlite3PagerUnref(pDbPage); 001062 } 001063 001064 /* 001065 ** Read an entry from the pointer map. 001066 ** 001067 ** This routine retrieves the pointer map entry for page 'key', writing 001068 ** the type and parent page number to *pEType and *pPgno respectively. 001069 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. 
001070 */ 001071 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ 001072 DbPage *pDbPage; /* The pointer map page */ 001073 int iPtrmap; /* Pointer map page index */ 001074 u8 *pPtrmap; /* Pointer map page data */ 001075 int offset; /* Offset of entry in pointer map */ 001076 int rc; 001077 001078 assert( sqlite3_mutex_held(pBt->mutex) ); 001079 001080 iPtrmap = PTRMAP_PAGENO(pBt, key); 001081 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0); 001082 if( rc!=0 ){ 001083 return rc; 001084 } 001085 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 001086 001087 offset = PTRMAP_PTROFFSET(iPtrmap, key); 001088 if( offset<0 ){ 001089 sqlite3PagerUnref(pDbPage); 001090 return SQLITE_CORRUPT_BKPT; 001091 } 001092 assert( offset <= (int)pBt->usableSize-5 ); 001093 assert( pEType!=0 ); 001094 *pEType = pPtrmap[offset]; 001095 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]); 001096 001097 sqlite3PagerUnref(pDbPage); 001098 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_PGNO(iPtrmap); 001099 return SQLITE_OK; 001100 } 001101 001102 #else /* if defined SQLITE_OMIT_AUTOVACUUM */ 001103 #define ptrmapPut(w,x,y,z,rc) 001104 #define ptrmapGet(w,x,y,z) SQLITE_OK 001105 #define ptrmapPutOvflPtr(x, y, z, rc) 001106 #endif 001107 001108 /* 001109 ** Given a btree page and a cell index (0 means the first cell on 001110 ** the page, 1 means the second cell, and so forth) return a pointer 001111 ** to the cell content. 001112 ** 001113 ** findCellPastPtr() does the same except it skips past the initial 001114 ** 4-byte child pointer found on interior pages, if there is one. 001115 ** 001116 ** This routine works only for pages that do not contain overflow cells. 
001117 */ 001118 #define findCell(P,I) \ 001119 ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)]))) 001120 #define findCellPastPtr(P,I) \ 001121 ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)]))) 001122 001123 001124 /* 001125 ** This is common tail processing for btreeParseCellPtr() and 001126 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely 001127 ** on a single B-tree page. Make necessary adjustments to the CellInfo 001128 ** structure. 001129 */ 001130 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow( 001131 MemPage *pPage, /* Page containing the cell */ 001132 u8 *pCell, /* Pointer to the cell text. */ 001133 CellInfo *pInfo /* Fill in this structure */ 001134 ){ 001135 /* If the payload will not fit completely on the local page, we have 001136 ** to decide how much to store locally and how much to spill onto 001137 ** overflow pages. The strategy is to minimize the amount of unused 001138 ** space on overflow pages while keeping the amount of local storage 001139 ** in between minLocal and maxLocal. 001140 ** 001141 ** Warning: changing the way overflow payload is distributed in any 001142 ** way will result in an incompatible file format. 
001143 */ 001144 int minLocal; /* Minimum amount of payload held locally */ 001145 int maxLocal; /* Maximum amount of payload held locally */ 001146 int surplus; /* Overflow payload available for local storage */ 001147 001148 minLocal = pPage->minLocal; 001149 maxLocal = pPage->maxLocal; 001150 surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4); 001151 testcase( surplus==maxLocal ); 001152 testcase( surplus==maxLocal+1 ); 001153 if( surplus <= maxLocal ){ 001154 pInfo->nLocal = (u16)surplus; 001155 }else{ 001156 pInfo->nLocal = (u16)minLocal; 001157 } 001158 pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4; 001159 } 001160 001161 /* 001162 ** Given a record with nPayload bytes of payload stored within btree 001163 ** page pPage, return the number of bytes of payload stored locally. 001164 */ 001165 static int btreePayloadToLocal(MemPage *pPage, i64 nPayload){ 001166 int maxLocal; /* Maximum amount of payload held locally */ 001167 maxLocal = pPage->maxLocal; 001168 if( nPayload<=maxLocal ){ 001169 return nPayload; 001170 }else{ 001171 int minLocal; /* Minimum amount of payload held locally */ 001172 int surplus; /* Overflow payload available for local storage */ 001173 minLocal = pPage->minLocal; 001174 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize-4); 001175 return ( surplus <= maxLocal ) ? surplus : minLocal; 001176 } 001177 } 001178 001179 /* 001180 ** The following routines are implementations of the MemPage.xParseCell() 001181 ** method. 001182 ** 001183 ** Parse a cell content block and fill in the CellInfo structure. 001184 ** 001185 ** btreeParseCellPtr() => table btree leaf nodes 001186 ** btreeParseCellNoPayload() => table btree internal nodes 001187 ** btreeParseCellPtrIndex() => index btree nodes 001188 ** 001189 ** There is also a wrapper function btreeParseCell() that works for 001190 ** all MemPage types and that references the cell by index rather than 001191 ** by pointer. 
001192 */ 001193 static void btreeParseCellPtrNoPayload( 001194 MemPage *pPage, /* Page containing the cell */ 001195 u8 *pCell, /* Pointer to the cell text. */ 001196 CellInfo *pInfo /* Fill in this structure */ 001197 ){ 001198 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001199 assert( pPage->leaf==0 ); 001200 assert( pPage->childPtrSize==4 ); 001201 #ifndef SQLITE_DEBUG 001202 UNUSED_PARAMETER(pPage); 001203 #endif 001204 pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey); 001205 pInfo->nPayload = 0; 001206 pInfo->nLocal = 0; 001207 pInfo->pPayload = 0; 001208 return; 001209 } 001210 static void btreeParseCellPtr( 001211 MemPage *pPage, /* Page containing the cell */ 001212 u8 *pCell, /* Pointer to the cell text. */ 001213 CellInfo *pInfo /* Fill in this structure */ 001214 ){ 001215 u8 *pIter; /* For scanning through pCell */ 001216 u32 nPayload; /* Number of bytes of cell payload */ 001217 u64 iKey; /* Extracted Key value */ 001218 001219 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001220 assert( pPage->leaf==0 || pPage->leaf==1 ); 001221 assert( pPage->intKeyLeaf ); 001222 assert( pPage->childPtrSize==0 ); 001223 pIter = pCell; 001224 001225 /* The next block of code is equivalent to: 001226 ** 001227 ** pIter += getVarint32(pIter, nPayload); 001228 ** 001229 ** The code is inlined to avoid a function call. 001230 */ 001231 nPayload = *pIter; 001232 if( nPayload>=0x80 ){ 001233 u8 *pEnd = &pIter[8]; 001234 nPayload &= 0x7f; 001235 do{ 001236 nPayload = (nPayload<<7) | (*++pIter & 0x7f); 001237 }while( (*pIter)>=0x80 && pIter<pEnd ); 001238 } 001239 pIter++; 001240 001241 /* The next block of code is equivalent to: 001242 ** 001243 ** pIter += getVarint(pIter, (u64*)&pInfo->nKey); 001244 ** 001245 ** The code is inlined and the loop is unrolled for performance. 001246 ** This routine is a high-runner. 
001247 */ 001248 iKey = *pIter; 001249 if( iKey>=0x80 ){ 001250 u8 x; 001251 iKey = (iKey<<7) ^ (x = *++pIter); 001252 if( x>=0x80 ){ 001253 iKey = (iKey<<7) ^ (x = *++pIter); 001254 if( x>=0x80 ){ 001255 iKey = (iKey<<7) ^ 0x10204000 ^ (x = *++pIter); 001256 if( x>=0x80 ){ 001257 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001258 if( x>=0x80 ){ 001259 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001260 if( x>=0x80 ){ 001261 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001262 if( x>=0x80 ){ 001263 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001264 if( x>=0x80 ){ 001265 iKey = (iKey<<8) ^ 0x8000 ^ (*++pIter); 001266 } 001267 } 001268 } 001269 } 001270 } 001271 }else{ 001272 iKey ^= 0x204000; 001273 } 001274 }else{ 001275 iKey ^= 0x4000; 001276 } 001277 } 001278 pIter++; 001279 001280 pInfo->nKey = *(i64*)&iKey; 001281 pInfo->nPayload = nPayload; 001282 pInfo->pPayload = pIter; 001283 testcase( nPayload==pPage->maxLocal ); 001284 testcase( nPayload==(u32)pPage->maxLocal+1 ); 001285 if( nPayload<=pPage->maxLocal ){ 001286 /* This is the (easy) common case where the entire payload fits 001287 ** on the local page. No overflow is required. 001288 */ 001289 pInfo->nSize = nPayload + (u16)(pIter - pCell); 001290 if( pInfo->nSize<4 ) pInfo->nSize = 4; 001291 pInfo->nLocal = (u16)nPayload; 001292 }else{ 001293 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo); 001294 } 001295 } 001296 static void btreeParseCellPtrIndex( 001297 MemPage *pPage, /* Page containing the cell */ 001298 u8 *pCell, /* Pointer to the cell text. 
*/ 001299 CellInfo *pInfo /* Fill in this structure */ 001300 ){ 001301 u8 *pIter; /* For scanning through pCell */ 001302 u32 nPayload; /* Number of bytes of cell payload */ 001303 001304 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001305 assert( pPage->leaf==0 || pPage->leaf==1 ); 001306 assert( pPage->intKeyLeaf==0 ); 001307 pIter = pCell + pPage->childPtrSize; 001308 nPayload = *pIter; 001309 if( nPayload>=0x80 ){ 001310 u8 *pEnd = &pIter[8]; 001311 nPayload &= 0x7f; 001312 do{ 001313 nPayload = (nPayload<<7) | (*++pIter & 0x7f); 001314 }while( *(pIter)>=0x80 && pIter<pEnd ); 001315 } 001316 pIter++; 001317 pInfo->nKey = nPayload; 001318 pInfo->nPayload = nPayload; 001319 pInfo->pPayload = pIter; 001320 testcase( nPayload==pPage->maxLocal ); 001321 testcase( nPayload==(u32)pPage->maxLocal+1 ); 001322 if( nPayload<=pPage->maxLocal ){ 001323 /* This is the (easy) common case where the entire payload fits 001324 ** on the local page. No overflow is required. 001325 */ 001326 pInfo->nSize = nPayload + (u16)(pIter - pCell); 001327 if( pInfo->nSize<4 ) pInfo->nSize = 4; 001328 pInfo->nLocal = (u16)nPayload; 001329 }else{ 001330 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo); 001331 } 001332 } 001333 static void btreeParseCell( 001334 MemPage *pPage, /* Page containing the cell */ 001335 int iCell, /* The cell index. First cell is 0 */ 001336 CellInfo *pInfo /* Fill in this structure */ 001337 ){ 001338 pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo); 001339 } 001340 001341 /* 001342 ** The following routines are implementations of the MemPage.xCellSize 001343 ** method. 001344 ** 001345 ** Compute the total number of bytes that a Cell needs in the cell 001346 ** data area of the btree-page. The return number includes the cell 001347 ** data header and the local payload, but not any overflow page or 001348 ** the space used by the cell pointer. 
001349 ** 001350 ** cellSizePtrNoPayload() => table internal nodes 001351 ** cellSizePtrTableLeaf() => table leaf nodes 001352 ** cellSizePtr() => index internal nodes 001353 ** cellSizeIdxLeaf() => index leaf nodes 001354 */ 001355 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ 001356 u8 *pIter = pCell + 4; /* For looping over bytes of pCell */ 001357 u8 *pEnd; /* End mark for a varint */ 001358 u32 nSize; /* Size value to return */ 001359 001360 #ifdef SQLITE_DEBUG 001361 /* The value returned by this function should always be the same as 001362 ** the (CellInfo.nSize) value found by doing a full parse of the 001363 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001364 ** this function verifies that this invariant is not violated. */ 001365 CellInfo debuginfo; 001366 pPage->xParseCell(pPage, pCell, &debuginfo); 001367 #endif 001368 001369 assert( pPage->childPtrSize==4 ); 001370 nSize = *pIter; 001371 if( nSize>=0x80 ){ 001372 pEnd = &pIter[8]; 001373 nSize &= 0x7f; 001374 do{ 001375 nSize = (nSize<<7) | (*++pIter & 0x7f); 001376 }while( *(pIter)>=0x80 && pIter<pEnd ); 001377 } 001378 pIter++; 001379 testcase( nSize==pPage->maxLocal ); 001380 testcase( nSize==(u32)pPage->maxLocal+1 ); 001381 if( nSize<=pPage->maxLocal ){ 001382 nSize += (u32)(pIter - pCell); 001383 assert( nSize>4 ); 001384 }else{ 001385 int minLocal = pPage->minLocal; 001386 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 001387 testcase( nSize==pPage->maxLocal ); 001388 testcase( nSize==(u32)pPage->maxLocal+1 ); 001389 if( nSize>pPage->maxLocal ){ 001390 nSize = minLocal; 001391 } 001392 nSize += 4 + (u16)(pIter - pCell); 001393 } 001394 assert( nSize==debuginfo.nSize || CORRUPT_DB ); 001395 return (u16)nSize; 001396 } 001397 static u16 cellSizePtrIdxLeaf(MemPage *pPage, u8 *pCell){ 001398 u8 *pIter = pCell; /* For looping over bytes of pCell */ 001399 u8 *pEnd; /* End mark for a varint */ 001400 u32 nSize; /* Size value to return */ 001401 001402 
#ifdef SQLITE_DEBUG 001403 /* The value returned by this function should always be the same as 001404 ** the (CellInfo.nSize) value found by doing a full parse of the 001405 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001406 ** this function verifies that this invariant is not violated. */ 001407 CellInfo debuginfo; 001408 pPage->xParseCell(pPage, pCell, &debuginfo); 001409 #endif 001410 001411 assert( pPage->childPtrSize==0 ); 001412 nSize = *pIter; 001413 if( nSize>=0x80 ){ 001414 pEnd = &pIter[8]; 001415 nSize &= 0x7f; 001416 do{ 001417 nSize = (nSize<<7) | (*++pIter & 0x7f); 001418 }while( *(pIter)>=0x80 && pIter<pEnd ); 001419 } 001420 pIter++; 001421 testcase( nSize==pPage->maxLocal ); 001422 testcase( nSize==(u32)pPage->maxLocal+1 ); 001423 if( nSize<=pPage->maxLocal ){ 001424 nSize += (u32)(pIter - pCell); 001425 if( nSize<4 ) nSize = 4; 001426 }else{ 001427 int minLocal = pPage->minLocal; 001428 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 001429 testcase( nSize==pPage->maxLocal ); 001430 testcase( nSize==(u32)pPage->maxLocal+1 ); 001431 if( nSize>pPage->maxLocal ){ 001432 nSize = minLocal; 001433 } 001434 nSize += 4 + (u16)(pIter - pCell); 001435 } 001436 assert( nSize==debuginfo.nSize || CORRUPT_DB ); 001437 return (u16)nSize; 001438 } 001439 static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){ 001440 u8 *pIter = pCell + 4; /* For looping over bytes of pCell */ 001441 u8 *pEnd; /* End mark for a varint */ 001442 001443 #ifdef SQLITE_DEBUG 001444 /* The value returned by this function should always be the same as 001445 ** the (CellInfo.nSize) value found by doing a full parse of the 001446 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001447 ** this function verifies that this invariant is not violated. 
*/ 001448 CellInfo debuginfo; 001449 pPage->xParseCell(pPage, pCell, &debuginfo); 001450 #else 001451 UNUSED_PARAMETER(pPage); 001452 #endif 001453 001454 assert( pPage->childPtrSize==4 ); 001455 pEnd = pIter + 9; 001456 while( (*pIter++)&0x80 && pIter<pEnd ); 001457 assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB ); 001458 return (u16)(pIter - pCell); 001459 } 001460 static u16 cellSizePtrTableLeaf(MemPage *pPage, u8 *pCell){ 001461 u8 *pIter = pCell; /* For looping over bytes of pCell */ 001462 u8 *pEnd; /* End mark for a varint */ 001463 u32 nSize; /* Size value to return */ 001464 001465 #ifdef SQLITE_DEBUG 001466 /* The value returned by this function should always be the same as 001467 ** the (CellInfo.nSize) value found by doing a full parse of the 001468 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001469 ** this function verifies that this invariant is not violated. */ 001470 CellInfo debuginfo; 001471 pPage->xParseCell(pPage, pCell, &debuginfo); 001472 #endif 001473 001474 nSize = *pIter; 001475 if( nSize>=0x80 ){ 001476 pEnd = &pIter[8]; 001477 nSize &= 0x7f; 001478 do{ 001479 nSize = (nSize<<7) | (*++pIter & 0x7f); 001480 }while( *(pIter)>=0x80 && pIter<pEnd ); 001481 } 001482 pIter++; 001483 /* pIter now points at the 64-bit integer key value, a variable length 001484 ** integer. The following block moves pIter to point at the first byte 001485 ** past the end of the key value. 
*/ 001486 if( (*pIter++)&0x80 001487 && (*pIter++)&0x80 001488 && (*pIter++)&0x80 001489 && (*pIter++)&0x80 001490 && (*pIter++)&0x80 001491 && (*pIter++)&0x80 001492 && (*pIter++)&0x80 001493 && (*pIter++)&0x80 ){ pIter++; } 001494 testcase( nSize==pPage->maxLocal ); 001495 testcase( nSize==(u32)pPage->maxLocal+1 ); 001496 if( nSize<=pPage->maxLocal ){ 001497 nSize += (u32)(pIter - pCell); 001498 if( nSize<4 ) nSize = 4; 001499 }else{ 001500 int minLocal = pPage->minLocal; 001501 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 001502 testcase( nSize==pPage->maxLocal ); 001503 testcase( nSize==(u32)pPage->maxLocal+1 ); 001504 if( nSize>pPage->maxLocal ){ 001505 nSize = minLocal; 001506 } 001507 nSize += 4 + (u16)(pIter - pCell); 001508 } 001509 assert( nSize==debuginfo.nSize || CORRUPT_DB ); 001510 return (u16)nSize; 001511 } 001512 001513 001514 #ifdef SQLITE_DEBUG 001515 /* This variation on cellSizePtr() is used inside of assert() statements 001516 ** only. */ 001517 static u16 cellSize(MemPage *pPage, int iCell){ 001518 return pPage->xCellSize(pPage, findCell(pPage, iCell)); 001519 } 001520 #endif 001521 001522 #ifndef SQLITE_OMIT_AUTOVACUUM 001523 /* 001524 ** The cell pCell is currently part of page pSrc but will ultimately be part 001525 ** of pPage. (pSrc and pPage are often the same.) If pCell contains a 001526 ** pointer to an overflow page, insert an entry into the pointer-map for 001527 ** the overflow page that will be valid after pCell has been moved to pPage. 
001528 */ 001529 static void ptrmapPutOvflPtr(MemPage *pPage, MemPage *pSrc, u8 *pCell,int *pRC){ 001530 CellInfo info; 001531 if( *pRC ) return; 001532 assert( pCell!=0 ); 001533 pPage->xParseCell(pPage, pCell, &info); 001534 if( info.nLocal<info.nPayload ){ 001535 Pgno ovfl; 001536 if( SQLITE_OVERFLOW(pSrc->aDataEnd, pCell, pCell+info.nLocal) ){ 001537 testcase( pSrc!=pPage ); 001538 *pRC = SQLITE_CORRUPT_BKPT; 001539 return; 001540 } 001541 ovfl = get4byte(&pCell[info.nSize-4]); 001542 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC); 001543 } 001544 } 001545 #endif 001546 001547 001548 /* 001549 ** Defragment the page given. This routine reorganizes cells within the 001550 ** page so that there are no free-blocks on the free-block list. 001551 ** 001552 ** Parameter nMaxFrag is the maximum amount of fragmented space that may be 001553 ** present in the page after this routine returns. 001554 ** 001555 ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a 001556 ** b-tree page so that there are no freeblocks or fragment bytes, all 001557 ** unused bytes are contained in the unallocated space region, and all 001558 ** cells are packed tightly at the end of the page. 
001559 */ 001560 static int defragmentPage(MemPage *pPage, int nMaxFrag){ 001561 int i; /* Loop counter */ 001562 int pc; /* Address of the i-th cell */ 001563 int hdr; /* Offset to the page header */ 001564 int size; /* Size of a cell */ 001565 int usableSize; /* Number of usable bytes on a page */ 001566 int cellOffset; /* Offset to the cell pointer array */ 001567 int cbrk; /* Offset to the cell content area */ 001568 int nCell; /* Number of cells on the page */ 001569 unsigned char *data; /* The page data */ 001570 unsigned char *temp; /* Temp area for cell content */ 001571 unsigned char *src; /* Source of content */ 001572 int iCellFirst; /* First allowable cell index */ 001573 int iCellLast; /* Last possible cell index */ 001574 int iCellStart; /* First cell offset in input */ 001575 001576 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001577 assert( pPage->pBt!=0 ); 001578 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); 001579 assert( pPage->nOverflow==0 ); 001580 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001581 data = pPage->aData; 001582 hdr = pPage->hdrOffset; 001583 cellOffset = pPage->cellOffset; 001584 nCell = pPage->nCell; 001585 assert( nCell==get2byte(&data[hdr+3]) || CORRUPT_DB ); 001586 iCellFirst = cellOffset + 2*nCell; 001587 usableSize = pPage->pBt->usableSize; 001588 001589 /* This block handles pages with two or fewer free blocks and nMaxFrag 001590 ** or fewer fragmented bytes. In this case it is faster to move the 001591 ** two (or one) blocks of cells using memmove() and add the required 001592 ** offsets to each pointer in the cell-pointer array than it is to 001593 ** reconstruct the entire page. 
*/ 001594 if( (int)data[hdr+7]<=nMaxFrag ){ 001595 int iFree = get2byte(&data[hdr+1]); 001596 if( iFree>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage); 001597 if( iFree ){ 001598 int iFree2 = get2byte(&data[iFree]); 001599 if( iFree2>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage); 001600 if( 0==iFree2 || (data[iFree2]==0 && data[iFree2+1]==0) ){ 001601 u8 *pEnd = &data[cellOffset + nCell*2]; 001602 u8 *pAddr; 001603 int sz2 = 0; 001604 int sz = get2byte(&data[iFree+2]); 001605 int top = get2byte(&data[hdr+5]); 001606 if( top>=iFree ){ 001607 return SQLITE_CORRUPT_PAGE(pPage); 001608 } 001609 if( iFree2 ){ 001610 if( iFree+sz>iFree2 ) return SQLITE_CORRUPT_PAGE(pPage); 001611 sz2 = get2byte(&data[iFree2+2]); 001612 if( iFree2+sz2 > usableSize ) return SQLITE_CORRUPT_PAGE(pPage); 001613 memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz)); 001614 sz += sz2; 001615 }else if( iFree+sz>usableSize ){ 001616 return SQLITE_CORRUPT_PAGE(pPage); 001617 } 001618 001619 cbrk = top+sz; 001620 assert( cbrk+(iFree-top) <= usableSize ); 001621 memmove(&data[cbrk], &data[top], iFree-top); 001622 for(pAddr=&data[cellOffset]; pAddr<pEnd; pAddr+=2){ 001623 pc = get2byte(pAddr); 001624 if( pc<iFree ){ put2byte(pAddr, pc+sz); } 001625 else if( pc<iFree2 ){ put2byte(pAddr, pc+sz2); } 001626 } 001627 goto defragment_out; 001628 } 001629 } 001630 } 001631 001632 cbrk = usableSize; 001633 iCellLast = usableSize - 4; 001634 iCellStart = get2byte(&data[hdr+5]); 001635 if( nCell>0 ){ 001636 temp = sqlite3PagerTempSpace(pPage->pBt->pPager); 001637 memcpy(temp, data, usableSize); 001638 src = temp; 001639 for(i=0; i<nCell; i++){ 001640 u8 *pAddr; /* The i-th cell pointer */ 001641 pAddr = &data[cellOffset + i*2]; 001642 pc = get2byte(pAddr); 001643 testcase( pc==iCellFirst ); 001644 testcase( pc==iCellLast ); 001645 /* These conditions have already been verified in btreeInitPage() 001646 ** if PRAGMA cell_size_check=ON. 
001647 */ 001648 if( pc>iCellLast ){ 001649 return SQLITE_CORRUPT_PAGE(pPage); 001650 } 001651 assert( pc>=0 && pc<=iCellLast ); 001652 size = pPage->xCellSize(pPage, &src[pc]); 001653 cbrk -= size; 001654 if( cbrk<iCellStart || pc+size>usableSize ){ 001655 return SQLITE_CORRUPT_PAGE(pPage); 001656 } 001657 assert( cbrk+size<=usableSize && cbrk>=iCellStart ); 001658 testcase( cbrk+size==usableSize ); 001659 testcase( pc+size==usableSize ); 001660 put2byte(pAddr, cbrk); 001661 memcpy(&data[cbrk], &src[pc], size); 001662 } 001663 } 001664 data[hdr+7] = 0; 001665 001666 defragment_out: 001667 assert( pPage->nFree>=0 ); 001668 if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){ 001669 return SQLITE_CORRUPT_PAGE(pPage); 001670 } 001671 assert( cbrk>=iCellFirst ); 001672 put2byte(&data[hdr+5], cbrk); 001673 data[hdr+1] = 0; 001674 data[hdr+2] = 0; 001675 memset(&data[iCellFirst], 0, cbrk-iCellFirst); 001676 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001677 return SQLITE_OK; 001678 } 001679 001680 /* 001681 ** Search the free-list on page pPg for space to store a cell nByte bytes in 001682 ** size. If one can be found, return a pointer to the space and remove it 001683 ** from the free-list. 001684 ** 001685 ** If no suitable space can be found on the free-list, return NULL. 001686 ** 001687 ** This function may detect corruption within pPg. If corruption is 001688 ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned. 001689 ** 001690 ** Slots on the free list that are between 1 and 3 bytes larger than nByte 001691 ** will be ignored if adding the extra space to the fragmentation count 001692 ** causes the fragmentation count to exceed 60. 
001693 */ 001694 static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){ 001695 const int hdr = pPg->hdrOffset; /* Offset to page header */ 001696 u8 * const aData = pPg->aData; /* Page data */ 001697 int iAddr = hdr + 1; /* Address of ptr to pc */ 001698 u8 *pTmp = &aData[iAddr]; /* Temporary ptr into aData[] */ 001699 int pc = get2byte(pTmp); /* Address of a free slot */ 001700 int x; /* Excess size of the slot */ 001701 int maxPC = pPg->pBt->usableSize - nByte; /* Max address for a usable slot */ 001702 int size; /* Size of the free slot */ 001703 001704 assert( pc>0 ); 001705 while( pc<=maxPC ){ 001706 /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each 001707 ** freeblock form a big-endian integer which is the size of the freeblock 001708 ** in bytes, including the 4-byte header. */ 001709 pTmp = &aData[pc+2]; 001710 size = get2byte(pTmp); 001711 if( (x = size - nByte)>=0 ){ 001712 testcase( x==4 ); 001713 testcase( x==3 ); 001714 if( x<4 ){ 001715 /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total 001716 ** number of bytes in fragments may not exceed 60. */ 001717 if( aData[hdr+7]>57 ) return 0; 001718 001719 /* Remove the slot from the free-list. Update the number of 001720 ** fragmented bytes within the page. */ 001721 memcpy(&aData[iAddr], &aData[pc], 2); 001722 aData[hdr+7] += (u8)x; 001723 return &aData[pc]; 001724 }else if( x+pc > maxPC ){ 001725 /* This slot extends off the end of the usable part of the page */ 001726 *pRc = SQLITE_CORRUPT_PAGE(pPg); 001727 return 0; 001728 }else{ 001729 /* The slot remains on the free-list. Reduce its size to account 001730 ** for the portion used by the new allocation. 
*/ 001731 put2byte(&aData[pc+2], x); 001732 } 001733 return &aData[pc + x]; 001734 } 001735 iAddr = pc; 001736 pTmp = &aData[pc]; 001737 pc = get2byte(pTmp); 001738 if( pc<=iAddr ){ 001739 if( pc ){ 001740 /* The next slot in the chain comes before the current slot */ 001741 *pRc = SQLITE_CORRUPT_PAGE(pPg); 001742 } 001743 return 0; 001744 } 001745 } 001746 if( pc>maxPC+nByte-4 ){ 001747 /* The free slot chain extends off the end of the page */ 001748 *pRc = SQLITE_CORRUPT_PAGE(pPg); 001749 } 001750 return 0; 001751 } 001752 001753 /* 001754 ** Allocate nByte bytes of space from within the B-Tree page passed 001755 ** as the first argument. Write into *pIdx the index into pPage->aData[] 001756 ** of the first byte of allocated space. Return either SQLITE_OK or 001757 ** an error code (usually SQLITE_CORRUPT). 001758 ** 001759 ** The caller guarantees that there is sufficient space to make the 001760 ** allocation. This routine might need to defragment in order to bring 001761 ** all the space together, however. This routine will avoid using 001762 ** the first two bytes past the cell pointer area since presumably this 001763 ** allocation is being made in order to insert a new cell, so we will 001764 ** also end up needing a new cell pointer. 
001765 */ 001766 static SQLITE_INLINE int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ 001767 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */ 001768 u8 * const data = pPage->aData; /* Local cache of pPage->aData */ 001769 int top; /* First byte of cell content area */ 001770 int rc = SQLITE_OK; /* Integer return code */ 001771 u8 *pTmp; /* Temp ptr into data[] */ 001772 int gap; /* First byte of gap between cell pointers and cell content */ 001773 001774 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001775 assert( pPage->pBt ); 001776 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001777 assert( nByte>=0 ); /* Minimum cell size is 4 */ 001778 assert( pPage->nFree>=nByte ); 001779 assert( pPage->nOverflow==0 ); 001780 assert( nByte < (int)(pPage->pBt->usableSize-8) ); 001781 001782 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf ); 001783 gap = pPage->cellOffset + 2*pPage->nCell; 001784 assert( gap<=65536 ); 001785 /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size 001786 ** and the reserved space is zero (the usual value for reserved space) 001787 ** then the cell content offset of an empty page wants to be 65536. 001788 ** However, that integer is too large to be stored in a 2-byte unsigned 001789 ** integer, so a value of 0 is used in its place. */ 001790 pTmp = &data[hdr+5]; 001791 top = get2byte(pTmp); 001792 if( gap>top ){ 001793 if( top==0 && pPage->pBt->usableSize==65536 ){ 001794 top = 65536; 001795 }else{ 001796 return SQLITE_CORRUPT_PAGE(pPage); 001797 } 001798 }else if( top>(int)pPage->pBt->usableSize ){ 001799 return SQLITE_CORRUPT_PAGE(pPage); 001800 } 001801 001802 /* If there is enough space between gap and top for one more cell pointer, 001803 ** and if the freelist is not empty, then search the 001804 ** freelist looking for a slot big enough to satisfy the request. 
001805 */ 001806 testcase( gap+2==top ); 001807 testcase( gap+1==top ); 001808 testcase( gap==top ); 001809 if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){ 001810 u8 *pSpace = pageFindSlot(pPage, nByte, &rc); 001811 if( pSpace ){ 001812 int g2; 001813 assert( pSpace+nByte<=data+pPage->pBt->usableSize ); 001814 *pIdx = g2 = (int)(pSpace-data); 001815 if( g2<=gap ){ 001816 return SQLITE_CORRUPT_PAGE(pPage); 001817 }else{ 001818 return SQLITE_OK; 001819 } 001820 }else if( rc ){ 001821 return rc; 001822 } 001823 } 001824 001825 /* The request could not be fulfilled using a freelist slot. Check 001826 ** to see if defragmentation is necessary. 001827 */ 001828 testcase( gap+2+nByte==top ); 001829 if( gap+2+nByte>top ){ 001830 assert( pPage->nCell>0 || CORRUPT_DB ); 001831 assert( pPage->nFree>=0 ); 001832 rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte))); 001833 if( rc ) return rc; 001834 top = get2byteNotZero(&data[hdr+5]); 001835 assert( gap+2+nByte<=top ); 001836 } 001837 001838 001839 /* Allocate memory from the gap in between the cell pointer array 001840 ** and the cell content area. The btreeComputeFreeSpace() call has already 001841 ** validated the freelist. Given that the freelist is valid, there 001842 ** is no way that the allocation can extend off the end of the page. 001843 ** The assert() below verifies the previous sentence. 001844 */ 001845 top -= nByte; 001846 put2byte(&data[hdr+5], top); 001847 assert( top+nByte <= (int)pPage->pBt->usableSize ); 001848 *pIdx = top; 001849 return SQLITE_OK; 001850 } 001851 001852 /* 001853 ** Return a section of the pPage->aData to the freelist. 001854 ** The first byte of the new free block is pPage->aData[iStart] 001855 ** and the size of the block is iSize bytes. 001856 ** 001857 ** Adjacent freeblocks are coalesced. 001858 ** 001859 ** Even though the freeblock list was checked by btreeComputeFreeSpace(), 001860 ** that routine will not detect overlap between cells or freeblocks. 
Nor 001861 ** does it detect cells or freeblocks that encroach into the reserved bytes 001862 ** at the end of the page. So do additional corruption checks inside this 001863 ** routine and return SQLITE_CORRUPT if any problems are found. 001864 */ 001865 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ 001866 u16 iPtr; /* Address of ptr to next freeblock */ 001867 u16 iFreeBlk; /* Address of the next freeblock */ 001868 u8 hdr; /* Page header size. 0 or 100 */ 001869 u8 nFrag = 0; /* Reduction in fragmentation */ 001870 u16 iOrigSize = iSize; /* Original value of iSize */ 001871 u16 x; /* Offset to cell content area */ 001872 u32 iEnd = iStart + iSize; /* First byte past the iStart buffer */ 001873 unsigned char *data = pPage->aData; /* Page content */ 001874 u8 *pTmp; /* Temporary ptr into data[] */ 001875 001876 assert( pPage->pBt!=0 ); 001877 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001878 assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize ); 001879 assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize ); 001880 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001881 assert( iSize>=4 ); /* Minimum cell size is 4 */ 001882 assert( CORRUPT_DB || iStart<=pPage->pBt->usableSize-4 ); 001883 001884 /* The list of freeblocks must be in ascending order. Find the 001885 ** spot on the list where iStart should be inserted. 
001886 */ 001887 hdr = pPage->hdrOffset; 001888 iPtr = hdr + 1; 001889 if( data[iPtr+1]==0 && data[iPtr]==0 ){ 001890 iFreeBlk = 0; /* Shortcut for the case when the freelist is empty */ 001891 }else{ 001892 while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){ 001893 if( iFreeBlk<=iPtr ){ 001894 if( iFreeBlk==0 ) break; /* TH3: corrupt082.100 */ 001895 return SQLITE_CORRUPT_PAGE(pPage); 001896 } 001897 iPtr = iFreeBlk; 001898 } 001899 if( iFreeBlk>pPage->pBt->usableSize-4 ){ /* TH3: corrupt081.100 */ 001900 return SQLITE_CORRUPT_PAGE(pPage); 001901 } 001902 assert( iFreeBlk>iPtr || iFreeBlk==0 || CORRUPT_DB ); 001903 001904 /* At this point: 001905 ** iFreeBlk: First freeblock after iStart, or zero if none 001906 ** iPtr: The address of a pointer to iFreeBlk 001907 ** 001908 ** Check to see if iFreeBlk should be coalesced onto the end of iStart. 001909 */ 001910 if( iFreeBlk && iEnd+3>=iFreeBlk ){ 001911 nFrag = iFreeBlk - iEnd; 001912 if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_PAGE(pPage); 001913 iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]); 001914 if( iEnd > pPage->pBt->usableSize ){ 001915 return SQLITE_CORRUPT_PAGE(pPage); 001916 } 001917 iSize = iEnd - iStart; 001918 iFreeBlk = get2byte(&data[iFreeBlk]); 001919 } 001920 001921 /* If iPtr is another freeblock (that is, if iPtr is not the freelist 001922 ** pointer in the page header) then check to see if iStart should be 001923 ** coalesced onto the end of iPtr. 
001924 */ 001925 if( iPtr>hdr+1 ){ 001926 int iPtrEnd = iPtr + get2byte(&data[iPtr+2]); 001927 if( iPtrEnd+3>=iStart ){ 001928 if( iPtrEnd>iStart ) return SQLITE_CORRUPT_PAGE(pPage); 001929 nFrag += iStart - iPtrEnd; 001930 iSize = iEnd - iPtr; 001931 iStart = iPtr; 001932 } 001933 } 001934 if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PAGE(pPage); 001935 data[hdr+7] -= nFrag; 001936 } 001937 pTmp = &data[hdr+5]; 001938 x = get2byte(pTmp); 001939 if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){ 001940 /* Overwrite deleted information with zeros when the secure_delete 001941 ** option is enabled */ 001942 memset(&data[iStart], 0, iSize); 001943 } 001944 if( iStart<=x ){ 001945 /* The new freeblock is at the beginning of the cell content area, 001946 ** so just extend the cell content area rather than create another 001947 ** freelist entry */ 001948 if( iStart<x ) return SQLITE_CORRUPT_PAGE(pPage); 001949 if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_PAGE(pPage); 001950 put2byte(&data[hdr+1], iFreeBlk); 001951 put2byte(&data[hdr+5], iEnd); 001952 }else{ 001953 /* Insert the new freeblock into the freelist */ 001954 put2byte(&data[iPtr], iStart); 001955 put2byte(&data[iStart], iFreeBlk); 001956 put2byte(&data[iStart+2], iSize); 001957 } 001958 pPage->nFree += iOrigSize; 001959 return SQLITE_OK; 001960 } 001961 001962 /* 001963 ** Decode the flags byte (the first byte of the header) for a page 001964 ** and initialize fields of the MemPage structure accordingly. 001965 ** 001966 ** Only the following combinations are supported. Anything different 001967 ** indicates a corrupt database files: 001968 ** 001969 ** PTF_ZERODATA (0x02, 2) 001970 ** PTF_LEAFDATA | PTF_INTKEY (0x05, 5) 001971 ** PTF_ZERODATA | PTF_LEAF (0x0a, 10) 001972 ** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF (0x0d, 13) 001973 */ 001974 static int decodeFlags(MemPage *pPage, int flagByte){ 001975 BtShared *pBt; /* A copy of pPage->pBt */ 001976 001977 assert( pPage->hdrOffset==(pPage->pgno==1 ? 
100 : 0) ); 001978 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001979 pBt = pPage->pBt; 001980 pPage->max1bytePayload = pBt->max1bytePayload; 001981 if( flagByte>=(PTF_ZERODATA | PTF_LEAF) ){ 001982 pPage->childPtrSize = 0; 001983 pPage->leaf = 1; 001984 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF) ){ 001985 pPage->intKeyLeaf = 1; 001986 pPage->xCellSize = cellSizePtrTableLeaf; 001987 pPage->xParseCell = btreeParseCellPtr; 001988 pPage->intKey = 1; 001989 pPage->maxLocal = pBt->maxLeaf; 001990 pPage->minLocal = pBt->minLeaf; 001991 }else if( flagByte==(PTF_ZERODATA | PTF_LEAF) ){ 001992 pPage->intKey = 0; 001993 pPage->intKeyLeaf = 0; 001994 pPage->xCellSize = cellSizePtrIdxLeaf; 001995 pPage->xParseCell = btreeParseCellPtrIndex; 001996 pPage->maxLocal = pBt->maxLocal; 001997 pPage->minLocal = pBt->minLocal; 001998 }else{ 001999 pPage->intKey = 0; 002000 pPage->intKeyLeaf = 0; 002001 pPage->xCellSize = cellSizePtrIdxLeaf; 002002 pPage->xParseCell = btreeParseCellPtrIndex; 002003 return SQLITE_CORRUPT_PAGE(pPage); 002004 } 002005 }else{ 002006 pPage->childPtrSize = 4; 002007 pPage->leaf = 0; 002008 if( flagByte==(PTF_ZERODATA) ){ 002009 pPage->intKey = 0; 002010 pPage->intKeyLeaf = 0; 002011 pPage->xCellSize = cellSizePtr; 002012 pPage->xParseCell = btreeParseCellPtrIndex; 002013 pPage->maxLocal = pBt->maxLocal; 002014 pPage->minLocal = pBt->minLocal; 002015 }else if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){ 002016 pPage->intKeyLeaf = 0; 002017 pPage->xCellSize = cellSizePtrNoPayload; 002018 pPage->xParseCell = btreeParseCellPtrNoPayload; 002019 pPage->intKey = 1; 002020 pPage->maxLocal = pBt->maxLeaf; 002021 pPage->minLocal = pBt->minLeaf; 002022 }else{ 002023 pPage->intKey = 0; 002024 pPage->intKeyLeaf = 0; 002025 pPage->xCellSize = cellSizePtr; 002026 pPage->xParseCell = btreeParseCellPtrIndex; 002027 return SQLITE_CORRUPT_PAGE(pPage); 002028 } 002029 } 002030 return SQLITE_OK; 002031 } 002032 002033 /* 002034 ** Compute the amount of freespace on 
the page. In other words, fill 002035 ** in the pPage->nFree field. 002036 */ 002037 static int btreeComputeFreeSpace(MemPage *pPage){ 002038 int pc; /* Address of a freeblock within pPage->aData[] */ 002039 u8 hdr; /* Offset to beginning of page header */ 002040 u8 *data; /* Equal to pPage->aData */ 002041 int usableSize; /* Amount of usable space on each page */ 002042 int nFree; /* Number of unused bytes on the page */ 002043 int top; /* First byte of the cell content area */ 002044 int iCellFirst; /* First allowable cell or freeblock offset */ 002045 int iCellLast; /* Last possible cell or freeblock offset */ 002046 002047 assert( pPage->pBt!=0 ); 002048 assert( pPage->pBt->db!=0 ); 002049 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002050 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 002051 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 002052 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 002053 assert( pPage->isInit==1 ); 002054 assert( pPage->nFree<0 ); 002055 002056 usableSize = pPage->pBt->usableSize; 002057 hdr = pPage->hdrOffset; 002058 data = pPage->aData; 002059 /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates 002060 ** the start of the cell content area. A zero value for this integer is 002061 ** interpreted as 65536. */ 002062 top = get2byteNotZero(&data[hdr+5]); 002063 iCellFirst = hdr + 8 + pPage->childPtrSize + 2*pPage->nCell; 002064 iCellLast = usableSize - 4; 002065 002066 /* Compute the total free space on the page 002067 ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the 002068 ** start of the first freeblock on the page, or is zero if there are no 002069 ** freeblocks. 
*/ 002070 pc = get2byte(&data[hdr+1]); 002071 nFree = data[hdr+7] + top; /* Init nFree to non-freeblock free space */ 002072 if( pc>0 ){ 002073 u32 next, size; 002074 if( pc<top ){ 002075 /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will 002076 ** always be at least one cell before the first freeblock. 002077 */ 002078 return SQLITE_CORRUPT_PAGE(pPage); 002079 } 002080 while( 1 ){ 002081 if( pc>iCellLast ){ 002082 /* Freeblock off the end of the page */ 002083 return SQLITE_CORRUPT_PAGE(pPage); 002084 } 002085 next = get2byte(&data[pc]); 002086 size = get2byte(&data[pc+2]); 002087 nFree = nFree + size; 002088 if( next<=pc+size+3 ) break; 002089 pc = next; 002090 } 002091 if( next>0 ){ 002092 /* Freeblock not in ascending order */ 002093 return SQLITE_CORRUPT_PAGE(pPage); 002094 } 002095 if( pc+size>(unsigned int)usableSize ){ 002096 /* Last freeblock extends past page end */ 002097 return SQLITE_CORRUPT_PAGE(pPage); 002098 } 002099 } 002100 002101 /* At this point, nFree contains the sum of the offset to the start 002102 ** of the cell-content area plus the number of free bytes within 002103 ** the cell-content area. If this is greater than the usable-size 002104 ** of the page, then the page must be corrupted. This check also 002105 ** serves to verify that the offset to the start of the cell-content 002106 ** area, according to the page header, lies within the page. 
002107 */ 002108 if( nFree>usableSize || nFree<iCellFirst ){ 002109 return SQLITE_CORRUPT_PAGE(pPage); 002110 } 002111 pPage->nFree = (u16)(nFree - iCellFirst); 002112 return SQLITE_OK; 002113 } 002114 002115 /* 002116 ** Do additional sanity check after btreeInitPage() if 002117 ** PRAGMA cell_size_check=ON 002118 */ 002119 static SQLITE_NOINLINE int btreeCellSizeCheck(MemPage *pPage){ 002120 int iCellFirst; /* First allowable cell or freeblock offset */ 002121 int iCellLast; /* Last possible cell or freeblock offset */ 002122 int i; /* Index into the cell pointer array */ 002123 int sz; /* Size of a cell */ 002124 int pc; /* Address of a freeblock within pPage->aData[] */ 002125 u8 *data; /* Equal to pPage->aData */ 002126 int usableSize; /* Maximum usable space on the page */ 002127 int cellOffset; /* Start of cell content area */ 002128 002129 iCellFirst = pPage->cellOffset + 2*pPage->nCell; 002130 usableSize = pPage->pBt->usableSize; 002131 iCellLast = usableSize - 4; 002132 data = pPage->aData; 002133 cellOffset = pPage->cellOffset; 002134 if( !pPage->leaf ) iCellLast--; 002135 for(i=0; i<pPage->nCell; i++){ 002136 pc = get2byteAligned(&data[cellOffset+i*2]); 002137 testcase( pc==iCellFirst ); 002138 testcase( pc==iCellLast ); 002139 if( pc<iCellFirst || pc>iCellLast ){ 002140 return SQLITE_CORRUPT_PAGE(pPage); 002141 } 002142 sz = pPage->xCellSize(pPage, &data[pc]); 002143 testcase( pc+sz==usableSize ); 002144 if( pc+sz>usableSize ){ 002145 return SQLITE_CORRUPT_PAGE(pPage); 002146 } 002147 } 002148 return SQLITE_OK; 002149 } 002150 002151 /* 002152 ** Initialize the auxiliary information for a disk block. 002153 ** 002154 ** Return SQLITE_OK on success. If we see that the page does 002155 ** not contain a well-formed database page, then return 002156 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not 002157 ** guarantee that the page is well-formed. It only shows that 002158 ** we failed to detect any corruption. 
002159 */ 002160 static int btreeInitPage(MemPage *pPage){ 002161 u8 *data; /* Equal to pPage->aData */ 002162 BtShared *pBt; /* The main btree structure */ 002163 002164 assert( pPage->pBt!=0 ); 002165 assert( pPage->pBt->db!=0 ); 002166 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002167 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 002168 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 002169 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 002170 assert( pPage->isInit==0 ); 002171 002172 pBt = pPage->pBt; 002173 data = pPage->aData + pPage->hdrOffset; 002174 /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating 002175 ** the b-tree page type. */ 002176 if( decodeFlags(pPage, data[0]) ){ 002177 return SQLITE_CORRUPT_PAGE(pPage); 002178 } 002179 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 002180 pPage->maskPage = (u16)(pBt->pageSize - 1); 002181 pPage->nOverflow = 0; 002182 pPage->cellOffset = pPage->hdrOffset + 8 + pPage->childPtrSize; 002183 pPage->aCellIdx = data + pPage->childPtrSize + 8; 002184 pPage->aDataEnd = pPage->aData + pBt->pageSize; 002185 pPage->aDataOfst = pPage->aData + pPage->childPtrSize; 002186 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the 002187 ** number of cells on the page. */ 002188 pPage->nCell = get2byte(&data[3]); 002189 if( pPage->nCell>MX_CELL(pBt) ){ 002190 /* To many cells for a single page. The page must be corrupt */ 002191 return SQLITE_CORRUPT_PAGE(pPage); 002192 } 002193 testcase( pPage->nCell==MX_CELL(pBt) ); 002194 /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only 002195 ** possible for a root page of a table that contains no rows) then the 002196 ** offset to the cell content area will equal the page size minus the 002197 ** bytes of reserved space. 
*/ 002198 assert( pPage->nCell>0 002199 || get2byteNotZero(&data[5])==(int)pBt->usableSize 002200 || CORRUPT_DB ); 002201 pPage->nFree = -1; /* Indicate that this value is yet uncomputed */ 002202 pPage->isInit = 1; 002203 if( pBt->db->flags & SQLITE_CellSizeCk ){ 002204 return btreeCellSizeCheck(pPage); 002205 } 002206 return SQLITE_OK; 002207 } 002208 002209 /* 002210 ** Set up a raw page so that it looks like a database page holding 002211 ** no entries. 002212 */ 002213 static void zeroPage(MemPage *pPage, int flags){ 002214 unsigned char *data = pPage->aData; 002215 BtShared *pBt = pPage->pBt; 002216 u8 hdr = pPage->hdrOffset; 002217 u16 first; 002218 002219 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno || CORRUPT_DB ); 002220 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 002221 assert( sqlite3PagerGetData(pPage->pDbPage) == data ); 002222 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 002223 assert( sqlite3_mutex_held(pBt->mutex) ); 002224 if( pBt->btsFlags & BTS_FAST_SECURE ){ 002225 memset(&data[hdr], 0, pBt->usableSize - hdr); 002226 } 002227 data[hdr] = (char)flags; 002228 first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8); 002229 memset(&data[hdr+1], 0, 4); 002230 data[hdr+7] = 0; 002231 put2byte(&data[hdr+5], pBt->usableSize); 002232 pPage->nFree = (u16)(pBt->usableSize - first); 002233 decodeFlags(pPage, flags); 002234 pPage->cellOffset = first; 002235 pPage->aDataEnd = &data[pBt->pageSize]; 002236 pPage->aCellIdx = &data[first]; 002237 pPage->aDataOfst = &data[pPage->childPtrSize]; 002238 pPage->nOverflow = 0; 002239 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 002240 pPage->maskPage = (u16)(pBt->pageSize - 1); 002241 pPage->nCell = 0; 002242 pPage->isInit = 1; 002243 } 002244 002245 002246 /* 002247 ** Convert a DbPage obtained from the pager into a MemPage used by 002248 ** the btree layer. 
002249 */ 002250 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){ 002251 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); 002252 if( pgno!=pPage->pgno ){ 002253 pPage->aData = sqlite3PagerGetData(pDbPage); 002254 pPage->pDbPage = pDbPage; 002255 pPage->pBt = pBt; 002256 pPage->pgno = pgno; 002257 pPage->hdrOffset = pgno==1 ? 100 : 0; 002258 } 002259 assert( pPage->aData==sqlite3PagerGetData(pDbPage) ); 002260 return pPage; 002261 } 002262 002263 /* 002264 ** Get a page from the pager. Initialize the MemPage.pBt and 002265 ** MemPage.aData elements if needed. See also: btreeGetUnusedPage(). 002266 ** 002267 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care 002268 ** about the content of the page at this time. So do not go to the disk 002269 ** to fetch the content. Just fill in the content with zeros for now. 002270 ** If in the future we call sqlite3PagerWrite() on this page, that 002271 ** means we have started to be concerned about content and the disk 002272 ** read should occur at that point. 002273 */ 002274 static int btreeGetPage( 002275 BtShared *pBt, /* The btree */ 002276 Pgno pgno, /* Number of the page to fetch */ 002277 MemPage **ppPage, /* Return the page in this parameter */ 002278 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */ 002279 ){ 002280 int rc; 002281 DbPage *pDbPage; 002282 002283 assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY ); 002284 assert( sqlite3_mutex_held(pBt->mutex) ); 002285 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags); 002286 if( rc ) return rc; 002287 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt); 002288 return SQLITE_OK; 002289 } 002290 002291 /* 002292 ** Retrieve a page from the pager cache. If the requested page is not 002293 ** already in the pager cache return NULL. Initialize the MemPage.pBt and 002294 ** MemPage.aData elements if needed. 
002295 */ 002296 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){ 002297 DbPage *pDbPage; 002298 assert( sqlite3_mutex_held(pBt->mutex) ); 002299 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); 002300 if( pDbPage ){ 002301 return btreePageFromDbPage(pDbPage, pgno, pBt); 002302 } 002303 return 0; 002304 } 002305 002306 /* 002307 ** Return the size of the database file in pages. If there is any kind of 002308 ** error, return ((unsigned int)-1). 002309 */ 002310 static Pgno btreePagecount(BtShared *pBt){ 002311 return pBt->nPage; 002312 } 002313 Pgno sqlite3BtreeLastPage(Btree *p){ 002314 assert( sqlite3BtreeHoldsMutex(p) ); 002315 return btreePagecount(p->pBt); 002316 } 002317 002318 /* 002319 ** Get a page from the pager and initialize it. 002320 */ 002321 static int getAndInitPage( 002322 BtShared *pBt, /* The database file */ 002323 Pgno pgno, /* Number of the page to get */ 002324 MemPage **ppPage, /* Write the page pointer here */ 002325 int bReadOnly /* True for a read-only page */ 002326 ){ 002327 int rc; 002328 DbPage *pDbPage; 002329 MemPage *pPage; 002330 assert( sqlite3_mutex_held(pBt->mutex) ); 002331 002332 if( pgno>btreePagecount(pBt) ){ 002333 *ppPage = 0; 002334 return SQLITE_CORRUPT_BKPT; 002335 } 002336 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly); 002337 if( rc ){ 002338 *ppPage = 0; 002339 return rc; 002340 } 002341 pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); 002342 if( pPage->isInit==0 ){ 002343 btreePageFromDbPage(pDbPage, pgno, pBt); 002344 rc = btreeInitPage(pPage); 002345 if( rc!=SQLITE_OK ){ 002346 releasePage(pPage); 002347 *ppPage = 0; 002348 return rc; 002349 } 002350 } 002351 assert( pPage->pgno==pgno || CORRUPT_DB ); 002352 assert( pPage->aData==sqlite3PagerGetData(pDbPage) ); 002353 *ppPage = pPage; 002354 return SQLITE_OK; 002355 } 002356 002357 /* 002358 ** Release a MemPage. This should be called once for each prior 002359 ** call to btreeGetPage. 
002360 ** 002361 ** Page1 is a special case and must be released using releasePageOne(). 002362 */ 002363 static void releasePageNotNull(MemPage *pPage){ 002364 assert( pPage->aData ); 002365 assert( pPage->pBt ); 002366 assert( pPage->pDbPage!=0 ); 002367 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 002368 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 002369 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002370 sqlite3PagerUnrefNotNull(pPage->pDbPage); 002371 } 002372 static void releasePage(MemPage *pPage){ 002373 if( pPage ) releasePageNotNull(pPage); 002374 } 002375 static void releasePageOne(MemPage *pPage){ 002376 assert( pPage!=0 ); 002377 assert( pPage->aData ); 002378 assert( pPage->pBt ); 002379 assert( pPage->pDbPage!=0 ); 002380 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 002381 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 002382 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002383 sqlite3PagerUnrefPageOne(pPage->pDbPage); 002384 } 002385 002386 /* 002387 ** Get an unused page. 002388 ** 002389 ** This works just like btreeGetPage() with the addition: 002390 ** 002391 ** * If the page is already in use for some other purpose, immediately 002392 ** release it and return an SQLITE_CURRUPT error. 
002393 ** * Make sure the isInit flag is clear 002394 */ 002395 static int btreeGetUnusedPage( 002396 BtShared *pBt, /* The btree */ 002397 Pgno pgno, /* Number of the page to fetch */ 002398 MemPage **ppPage, /* Return the page in this parameter */ 002399 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */ 002400 ){ 002401 int rc = btreeGetPage(pBt, pgno, ppPage, flags); 002402 if( rc==SQLITE_OK ){ 002403 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ 002404 releasePage(*ppPage); 002405 *ppPage = 0; 002406 return SQLITE_CORRUPT_BKPT; 002407 } 002408 (*ppPage)->isInit = 0; 002409 }else{ 002410 *ppPage = 0; 002411 } 002412 return rc; 002413 } 002414 002415 002416 /* 002417 ** During a rollback, when the pager reloads information into the cache 002418 ** so that the cache is restored to its original state at the start of 002419 ** the transaction, for each page restored this routine is called. 002420 ** 002421 ** This routine needs to reset the extra data section at the end of the 002422 ** page to agree with the restored data. 002423 */ 002424 static void pageReinit(DbPage *pData){ 002425 MemPage *pPage; 002426 pPage = (MemPage *)sqlite3PagerGetExtra(pData); 002427 assert( sqlite3PagerPageRefcount(pData)>0 ); 002428 if( pPage->isInit ){ 002429 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002430 pPage->isInit = 0; 002431 if( sqlite3PagerPageRefcount(pData)>1 ){ 002432 /* pPage might not be a btree page; it might be an overflow page 002433 ** or ptrmap page or a free page. In those cases, the following 002434 ** call to btreeInitPage() will likely return SQLITE_CORRUPT. 002435 ** But no harm is done by this. And it is very important that 002436 ** btreeInitPage() be called on every btree page so we make 002437 ** the call for every page that comes in for re-initializing. */ 002438 btreeInitPage(pPage); 002439 } 002440 } 002441 } 002442 002443 /* 002444 ** Invoke the busy handler for a btree. 
002445 */ 002446 static int btreeInvokeBusyHandler(void *pArg){ 002447 BtShared *pBt = (BtShared*)pArg; 002448 assert( pBt->db ); 002449 assert( sqlite3_mutex_held(pBt->db->mutex) ); 002450 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler); 002451 } 002452 002453 /* 002454 ** Open a database file. 002455 ** 002456 ** zFilename is the name of the database file. If zFilename is NULL 002457 ** then an ephemeral database is created. The ephemeral database might 002458 ** be exclusively in memory, or it might use a disk-based memory cache. 002459 ** Either way, the ephemeral database will be automatically deleted 002460 ** when sqlite3BtreeClose() is called. 002461 ** 002462 ** If zFilename is ":memory:" then an in-memory database is created 002463 ** that is automatically destroyed when it is closed. 002464 ** 002465 ** The "flags" parameter is a bitmask that might contain bits like 002466 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY. 002467 ** 002468 ** If the database is already opened in the same database connection 002469 ** and we are in shared cache mode, then the open will fail with an 002470 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared 002471 ** objects in the same database connection since doing so will lead 002472 ** to problems with locking. 002473 */ 002474 int sqlite3BtreeOpen( 002475 sqlite3_vfs *pVfs, /* VFS to use for this b-tree */ 002476 const char *zFilename, /* Name of the file containing the BTree database */ 002477 sqlite3 *db, /* Associated database handle */ 002478 Btree **ppBtree, /* Pointer to new Btree object written here */ 002479 int flags, /* Options */ 002480 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */ 002481 ){ 002482 BtShared *pBt = 0; /* Shared part of btree structure */ 002483 Btree *p; /* Handle to return */ 002484 sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. 
Ticket #3537 */ 002485 int rc = SQLITE_OK; /* Result code from this function */ 002486 u8 nReserve; /* Byte of unused space on each page */ 002487 unsigned char zDbHeader[100]; /* Database header content */ 002488 002489 /* True if opening an ephemeral, temporary database */ 002490 const int isTempDb = zFilename==0 || zFilename[0]==0; 002491 002492 /* Set the variable isMemdb to true for an in-memory database, or 002493 ** false for a file-based database. 002494 */ 002495 #ifdef SQLITE_OMIT_MEMORYDB 002496 const int isMemdb = 0; 002497 #else 002498 const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0) 002499 || (isTempDb && sqlite3TempInMemory(db)) 002500 || (vfsFlags & SQLITE_OPEN_MEMORY)!=0; 002501 #endif 002502 002503 assert( db!=0 ); 002504 assert( pVfs!=0 ); 002505 assert( sqlite3_mutex_held(db->mutex) ); 002506 assert( (flags&0xff)==flags ); /* flags fit in 8 bits */ 002507 002508 /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */ 002509 assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 ); 002510 002511 /* A BTREE_SINGLE database is always a temporary and/or ephemeral */ 002512 assert( (flags & BTREE_SINGLE)==0 || isTempDb ); 002513 002514 if( isMemdb ){ 002515 flags |= BTREE_MEMORY; 002516 } 002517 if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){ 002518 vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB; 002519 } 002520 p = sqlite3MallocZero(sizeof(Btree)); 002521 if( !p ){ 002522 return SQLITE_NOMEM_BKPT; 002523 } 002524 p->inTrans = TRANS_NONE; 002525 p->db = db; 002526 #ifndef SQLITE_OMIT_SHARED_CACHE 002527 p->lock.pBtree = p; 002528 p->lock.iTable = 1; 002529 #endif 002530 002531 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 002532 /* 002533 ** If this Btree is a candidate for shared cache, try to find an 002534 ** existing BtShared object that we can share with 002535 */ 002536 if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){ 002537 if( 
vfsFlags & SQLITE_OPEN_SHAREDCACHE ){ 002538 int nFilename = sqlite3Strlen30(zFilename)+1; 002539 int nFullPathname = pVfs->mxPathname+1; 002540 char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename)); 002541 MUTEX_LOGIC( sqlite3_mutex *mutexShared; ) 002542 002543 p->sharable = 1; 002544 if( !zFullPathname ){ 002545 sqlite3_free(p); 002546 return SQLITE_NOMEM_BKPT; 002547 } 002548 if( isMemdb ){ 002549 memcpy(zFullPathname, zFilename, nFilename); 002550 }else{ 002551 rc = sqlite3OsFullPathname(pVfs, zFilename, 002552 nFullPathname, zFullPathname); 002553 if( rc ){ 002554 if( rc==SQLITE_OK_SYMLINK ){ 002555 rc = SQLITE_OK; 002556 }else{ 002557 sqlite3_free(zFullPathname); 002558 sqlite3_free(p); 002559 return rc; 002560 } 002561 } 002562 } 002563 #if SQLITE_THREADSAFE 002564 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN); 002565 sqlite3_mutex_enter(mutexOpen); 002566 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN); 002567 sqlite3_mutex_enter(mutexShared); 002568 #endif 002569 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){ 002570 assert( pBt->nRef>0 ); 002571 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0)) 002572 && sqlite3PagerVfs(pBt->pPager)==pVfs ){ 002573 int iDb; 002574 for(iDb=db->nDb-1; iDb>=0; iDb--){ 002575 Btree *pExisting = db->aDb[iDb].pBt; 002576 if( pExisting && pExisting->pBt==pBt ){ 002577 sqlite3_mutex_leave(mutexShared); 002578 sqlite3_mutex_leave(mutexOpen); 002579 sqlite3_free(zFullPathname); 002580 sqlite3_free(p); 002581 return SQLITE_CONSTRAINT; 002582 } 002583 } 002584 p->pBt = pBt; 002585 pBt->nRef++; 002586 break; 002587 } 002588 } 002589 sqlite3_mutex_leave(mutexShared); 002590 sqlite3_free(zFullPathname); 002591 } 002592 #ifdef SQLITE_DEBUG 002593 else{ 002594 /* In debug mode, we mark all persistent databases as sharable 002595 ** even when they are not. 
This exercises the locking code and 002596 ** gives more opportunity for asserts(sqlite3_mutex_held()) 002597 ** statements to find locking problems. 002598 */ 002599 p->sharable = 1; 002600 } 002601 #endif 002602 } 002603 #endif 002604 if( pBt==0 ){ 002605 /* 002606 ** The following asserts make sure that structures used by the btree are 002607 ** the right size. This is to guard against size changes that result 002608 ** when compiling on a different architecture. 002609 */ 002610 assert( sizeof(i64)==8 ); 002611 assert( sizeof(u64)==8 ); 002612 assert( sizeof(u32)==4 ); 002613 assert( sizeof(u16)==2 ); 002614 assert( sizeof(Pgno)==4 ); 002615 002616 /* Suppress false-positive compiler warning from PVS-Studio */ 002617 memset(&zDbHeader[16], 0, 8); 002618 002619 pBt = sqlite3MallocZero( sizeof(*pBt) ); 002620 if( pBt==0 ){ 002621 rc = SQLITE_NOMEM_BKPT; 002622 goto btree_open_out; 002623 } 002624 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename, 002625 sizeof(MemPage), flags, vfsFlags, pageReinit); 002626 if( rc==SQLITE_OK ){ 002627 sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap); 002628 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader); 002629 } 002630 if( rc!=SQLITE_OK ){ 002631 goto btree_open_out; 002632 } 002633 pBt->openFlags = (u8)flags; 002634 pBt->db = db; 002635 sqlite3PagerSetBusyHandler(pBt->pPager, btreeInvokeBusyHandler, pBt); 002636 p->pBt = pBt; 002637 002638 pBt->pCursor = 0; 002639 pBt->pPage1 = 0; 002640 if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY; 002641 #if defined(SQLITE_SECURE_DELETE) 002642 pBt->btsFlags |= BTS_SECURE_DELETE; 002643 #elif defined(SQLITE_FAST_SECURE_DELETE) 002644 pBt->btsFlags |= BTS_OVERWRITE; 002645 #endif 002646 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is 002647 ** determined by the 2-byte integer located at an offset of 16 bytes from 002648 ** the beginning of the database file. 
*/ 002649 pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16); 002650 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE 002651 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ 002652 pBt->pageSize = 0; 002653 #ifndef SQLITE_OMIT_AUTOVACUUM 002654 /* If the magic name ":memory:" will create an in-memory database, then 002655 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if 002656 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if 002657 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a 002658 ** regular file-name. In this case the auto-vacuum applies as per normal. 002659 */ 002660 if( zFilename && !isMemdb ){ 002661 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0); 002662 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0); 002663 } 002664 #endif 002665 nReserve = 0; 002666 }else{ 002667 /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is 002668 ** determined by the one-byte unsigned integer found at an offset of 20 002669 ** into the database file header. */ 002670 nReserve = zDbHeader[20]; 002671 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 002672 #ifndef SQLITE_OMIT_AUTOVACUUM 002673 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0); 002674 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0); 002675 #endif 002676 } 002677 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 002678 if( rc ) goto btree_open_out; 002679 pBt->usableSize = pBt->pageSize - nReserve; 002680 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */ 002681 002682 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 002683 /* Add the new BtShared object to the linked list sharable BtShareds. 
002684 */ 002685 pBt->nRef = 1; 002686 if( p->sharable ){ 002687 MUTEX_LOGIC( sqlite3_mutex *mutexShared; ) 002688 MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN);) 002689 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){ 002690 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST); 002691 if( pBt->mutex==0 ){ 002692 rc = SQLITE_NOMEM_BKPT; 002693 goto btree_open_out; 002694 } 002695 } 002696 sqlite3_mutex_enter(mutexShared); 002697 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList); 002698 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt; 002699 sqlite3_mutex_leave(mutexShared); 002700 } 002701 #endif 002702 } 002703 002704 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 002705 /* If the new Btree uses a sharable pBtShared, then link the new 002706 ** Btree into the list of all sharable Btrees for the same connection. 002707 ** The list is kept in ascending order by pBt address. 002708 */ 002709 if( p->sharable ){ 002710 int i; 002711 Btree *pSib; 002712 for(i=0; i<db->nDb; i++){ 002713 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){ 002714 while( pSib->pPrev ){ pSib = pSib->pPrev; } 002715 if( (uptr)p->pBt<(uptr)pSib->pBt ){ 002716 p->pNext = pSib; 002717 p->pPrev = 0; 002718 pSib->pPrev = p; 002719 }else{ 002720 while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){ 002721 pSib = pSib->pNext; 002722 } 002723 p->pNext = pSib->pNext; 002724 p->pPrev = pSib; 002725 if( p->pNext ){ 002726 p->pNext->pPrev = p; 002727 } 002728 pSib->pNext = p; 002729 } 002730 break; 002731 } 002732 } 002733 } 002734 #endif 002735 *ppBtree = p; 002736 002737 btree_open_out: 002738 if( rc!=SQLITE_OK ){ 002739 if( pBt && pBt->pPager ){ 002740 sqlite3PagerClose(pBt->pPager, 0); 002741 } 002742 sqlite3_free(pBt); 002743 sqlite3_free(p); 002744 *ppBtree = 0; 002745 }else{ 002746 sqlite3_file *pFile; 002747 002748 /* If the B-Tree was successfully opened, set the pager-cache size to the 002749 ** default value. 
Except, when opening on an existing shared pager-cache, 002750 ** do not change the pager-cache size. 002751 */ 002752 if( sqlite3BtreeSchema(p, 0, 0)==0 ){ 002753 sqlite3BtreeSetCacheSize(p, SQLITE_DEFAULT_CACHE_SIZE); 002754 } 002755 002756 pFile = sqlite3PagerFile(pBt->pPager); 002757 if( pFile->pMethods ){ 002758 sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db); 002759 } 002760 } 002761 if( mutexOpen ){ 002762 assert( sqlite3_mutex_held(mutexOpen) ); 002763 sqlite3_mutex_leave(mutexOpen); 002764 } 002765 assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 ); 002766 return rc; 002767 } 002768 002769 /* 002770 ** Decrement the BtShared.nRef counter. When it reaches zero, 002771 ** remove the BtShared structure from the sharing list. Return 002772 ** true if the BtShared.nRef counter reaches zero and return 002773 ** false if it is still positive. 002774 */ 002775 static int removeFromSharingList(BtShared *pBt){ 002776 #ifndef SQLITE_OMIT_SHARED_CACHE 002777 MUTEX_LOGIC( sqlite3_mutex *pMainMtx; ) 002778 BtShared *pList; 002779 int removed = 0; 002780 002781 assert( sqlite3_mutex_notheld(pBt->mutex) ); 002782 MUTEX_LOGIC( pMainMtx = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN); ) 002783 sqlite3_mutex_enter(pMainMtx); 002784 pBt->nRef--; 002785 if( pBt->nRef<=0 ){ 002786 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){ 002787 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext; 002788 }else{ 002789 pList = GLOBAL(BtShared*,sqlite3SharedCacheList); 002790 while( ALWAYS(pList) && pList->pNext!=pBt ){ 002791 pList=pList->pNext; 002792 } 002793 if( ALWAYS(pList) ){ 002794 pList->pNext = pBt->pNext; 002795 } 002796 } 002797 if( SQLITE_THREADSAFE ){ 002798 sqlite3_mutex_free(pBt->mutex); 002799 } 002800 removed = 1; 002801 } 002802 sqlite3_mutex_leave(pMainMtx); 002803 return removed; 002804 #else 002805 return 1; 002806 #endif 002807 } 002808 002809 /* 002810 ** Make sure pBt->pTmpSpace points to an allocation of 002811 ** 
MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child 002812 ** pointer. 002813 */ 002814 static SQLITE_NOINLINE int allocateTempSpace(BtShared *pBt){ 002815 assert( pBt!=0 ); 002816 assert( pBt->pTmpSpace==0 ); 002817 /* This routine is called only by btreeCursor() when allocating the 002818 ** first write cursor for the BtShared object */ 002819 assert( pBt->pCursor!=0 && (pBt->pCursor->curFlags & BTCF_WriteFlag)!=0 ); 002820 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize ); 002821 if( pBt->pTmpSpace==0 ){ 002822 BtCursor *pCur = pBt->pCursor; 002823 pBt->pCursor = pCur->pNext; /* Unlink the cursor */ 002824 memset(pCur, 0, sizeof(*pCur)); 002825 return SQLITE_NOMEM_BKPT; 002826 } 002827 002828 /* One of the uses of pBt->pTmpSpace is to format cells before 002829 ** inserting them into a leaf page (function fillInCell()). If 002830 ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes 002831 ** by the various routines that manipulate binary cells. Which 002832 ** can mean that fillInCell() only initializes the first 2 or 3 002833 ** bytes of pTmpSpace, but that the first 4 bytes are copied from 002834 ** it into a database page. This is not actually a problem, but it 002835 ** does cause a valgrind error when the 1 or 2 bytes of uninitialized 002836 ** data is passed to system call write(). So to avoid this error, 002837 ** zero the first 4 bytes of temp space here. 002838 ** 002839 ** Also: Provide four bytes of initialized space before the 002840 ** beginning of pTmpSpace as an area available to prepend the 002841 ** left-child pointer to the beginning of a cell. 
002842 */ 002843 memset(pBt->pTmpSpace, 0, 8); 002844 pBt->pTmpSpace += 4; 002845 return SQLITE_OK; 002846 } 002847 002848 /* 002849 ** Free the pBt->pTmpSpace allocation 002850 */ 002851 static void freeTempSpace(BtShared *pBt){ 002852 if( pBt->pTmpSpace ){ 002853 pBt->pTmpSpace -= 4; 002854 sqlite3PageFree(pBt->pTmpSpace); 002855 pBt->pTmpSpace = 0; 002856 } 002857 } 002858 002859 /* 002860 ** Close an open database and invalidate all cursors. 002861 */ 002862 int sqlite3BtreeClose(Btree *p){ 002863 BtShared *pBt = p->pBt; 002864 002865 /* Close all cursors opened via this handle. */ 002866 assert( sqlite3_mutex_held(p->db->mutex) ); 002867 sqlite3BtreeEnter(p); 002868 002869 /* Verify that no other cursors have this Btree open */ 002870 #ifdef SQLITE_DEBUG 002871 { 002872 BtCursor *pCur = pBt->pCursor; 002873 while( pCur ){ 002874 BtCursor *pTmp = pCur; 002875 pCur = pCur->pNext; 002876 assert( pTmp->pBtree!=p ); 002877 002878 } 002879 } 002880 #endif 002881 002882 /* Rollback any active transaction and free the handle structure. 002883 ** The call to sqlite3BtreeRollback() drops any table-locks held by 002884 ** this handle. 002885 */ 002886 sqlite3BtreeRollback(p, SQLITE_OK, 0); 002887 sqlite3BtreeLeave(p); 002888 002889 /* If there are still other outstanding references to the shared-btree 002890 ** structure, return now. The remainder of this procedure cleans 002891 ** up the shared-btree. 002892 */ 002893 assert( p->wantToLock==0 && p->locked==0 ); 002894 if( !p->sharable || removeFromSharingList(pBt) ){ 002895 /* The pBt is no longer on the sharing list, so we can access 002896 ** it without having to hold the mutex. 002897 ** 002898 ** Clean out and delete the BtShared object. 
002899 */ 002900 assert( !pBt->pCursor ); 002901 sqlite3PagerClose(pBt->pPager, p->db); 002902 if( pBt->xFreeSchema && pBt->pSchema ){ 002903 pBt->xFreeSchema(pBt->pSchema); 002904 } 002905 sqlite3DbFree(0, pBt->pSchema); 002906 freeTempSpace(pBt); 002907 sqlite3_free(pBt); 002908 } 002909 002910 #ifndef SQLITE_OMIT_SHARED_CACHE 002911 assert( p->wantToLock==0 ); 002912 assert( p->locked==0 ); 002913 if( p->pPrev ) p->pPrev->pNext = p->pNext; 002914 if( p->pNext ) p->pNext->pPrev = p->pPrev; 002915 #endif 002916 002917 sqlite3_free(p); 002918 return SQLITE_OK; 002919 } 002920 002921 /* 002922 ** Change the "soft" limit on the number of pages in the cache. 002923 ** Unused and unmodified pages will be recycled when the number of 002924 ** pages in the cache exceeds this soft limit. But the size of the 002925 ** cache is allowed to grow larger than this limit if it contains 002926 ** dirty pages or pages still in active use. 002927 */ 002928 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){ 002929 BtShared *pBt = p->pBt; 002930 assert( sqlite3_mutex_held(p->db->mutex) ); 002931 sqlite3BtreeEnter(p); 002932 sqlite3PagerSetCachesize(pBt->pPager, mxPage); 002933 sqlite3BtreeLeave(p); 002934 return SQLITE_OK; 002935 } 002936 002937 /* 002938 ** Change the "spill" limit on the number of pages in the cache. 002939 ** If the number of pages exceeds this limit during a write transaction, 002940 ** the pager might attempt to "spill" pages to the journal early in 002941 ** order to free up memory. 002942 ** 002943 ** The value returned is the current spill size. If zero is passed 002944 ** as an argument, no changes are made to the spill size setting, so 002945 ** using mxPage of 0 is a way to query the current spill size. 
002946 */ 002947 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){ 002948 BtShared *pBt = p->pBt; 002949 int res; 002950 assert( sqlite3_mutex_held(p->db->mutex) ); 002951 sqlite3BtreeEnter(p); 002952 res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage); 002953 sqlite3BtreeLeave(p); 002954 return res; 002955 } 002956 002957 #if SQLITE_MAX_MMAP_SIZE>0 002958 /* 002959 ** Change the limit on the amount of the database file that may be 002960 ** memory mapped. 002961 */ 002962 int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){ 002963 BtShared *pBt = p->pBt; 002964 assert( sqlite3_mutex_held(p->db->mutex) ); 002965 sqlite3BtreeEnter(p); 002966 sqlite3PagerSetMmapLimit(pBt->pPager, szMmap); 002967 sqlite3BtreeLeave(p); 002968 return SQLITE_OK; 002969 } 002970 #endif /* SQLITE_MAX_MMAP_SIZE>0 */ 002971 002972 /* 002973 ** Change the way data is synced to disk in order to increase or decrease 002974 ** how well the database resists damage due to OS crashes and power 002975 ** failures. Level 1 is the same as asynchronous (no syncs() occur and 002976 ** there is a high probability of damage) Level 2 is the default. There 002977 ** is a very low but non-zero probability of damage. Level 3 reduces the 002978 ** probability of damage to near zero but with a write performance reduction. 002979 */ 002980 #ifndef SQLITE_OMIT_PAGER_PRAGMAS 002981 int sqlite3BtreeSetPagerFlags( 002982 Btree *p, /* The btree to set the safety level on */ 002983 unsigned pgFlags /* Various PAGER_* flags */ 002984 ){ 002985 BtShared *pBt = p->pBt; 002986 assert( sqlite3_mutex_held(p->db->mutex) ); 002987 sqlite3BtreeEnter(p); 002988 sqlite3PagerSetFlags(pBt->pPager, pgFlags); 002989 sqlite3BtreeLeave(p); 002990 return SQLITE_OK; 002991 } 002992 #endif 002993 002994 /* 002995 ** Change the default pages size and the number of reserved bytes per page. 002996 ** Or, if the page size has already been fixed, return SQLITE_READONLY 002997 ** without changing anything. 
002998 ** 002999 ** The page size must be a power of 2 between 512 and 65536. If the page 003000 ** size supplied does not meet this constraint then the page size is not 003001 ** changed. 003002 ** 003003 ** Page sizes are constrained to be a power of two so that the region 003004 ** of the database file used for locking (beginning at PENDING_BYTE, 003005 ** the first byte past the 1GB boundary, 0x40000000) needs to occur 003006 ** at the beginning of a page. 003007 ** 003008 ** If parameter nReserve is less than zero, then the number of reserved 003009 ** bytes per page is left unchanged. 003010 ** 003011 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size 003012 ** and autovacuum mode can no longer be changed. 003013 */ 003014 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){ 003015 int rc = SQLITE_OK; 003016 int x; 003017 BtShared *pBt = p->pBt; 003018 assert( nReserve>=0 && nReserve<=255 ); 003019 sqlite3BtreeEnter(p); 003020 pBt->nReserveWanted = nReserve; 003021 x = pBt->pageSize - pBt->usableSize; 003022 if( nReserve<x ) nReserve = x; 003023 if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){ 003024 sqlite3BtreeLeave(p); 003025 return SQLITE_READONLY; 003026 } 003027 assert( nReserve>=0 && nReserve<=255 ); 003028 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE && 003029 ((pageSize-1)&pageSize)==0 ){ 003030 assert( (pageSize & 7)==0 ); 003031 assert( !pBt->pCursor ); 003032 if( nReserve>32 && pageSize==512 ) pageSize = 1024; 003033 pBt->pageSize = (u32)pageSize; 003034 freeTempSpace(pBt); 003035 } 003036 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 003037 pBt->usableSize = pBt->pageSize - (u16)nReserve; 003038 if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003039 sqlite3BtreeLeave(p); 003040 return rc; 003041 } 003042 003043 /* 003044 ** Return the currently defined page size 003045 */ 003046 int sqlite3BtreeGetPageSize(Btree *p){ 003047 return p->pBt->pageSize; 003048 } 003049 003050 
/* 003051 ** This function is similar to sqlite3BtreeGetReserve(), except that it 003052 ** may only be called if it is guaranteed that the b-tree mutex is already 003053 ** held. 003054 ** 003055 ** This is useful in one special case in the backup API code where it is 003056 ** known that the shared b-tree mutex is held, but the mutex on the 003057 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter() 003058 ** were to be called, it might collide with some other operation on the 003059 ** database handle that owns *p, causing undefined behavior. 003060 */ 003061 int sqlite3BtreeGetReserveNoMutex(Btree *p){ 003062 int n; 003063 assert( sqlite3_mutex_held(p->pBt->mutex) ); 003064 n = p->pBt->pageSize - p->pBt->usableSize; 003065 return n; 003066 } 003067 003068 /* 003069 ** Return the number of bytes of space at the end of every page that 003070 ** are intentionally left unused. This is the "reserved" space that is 003071 ** sometimes used by extensions. 003072 ** 003073 ** The value returned is the larger of the current reserve size and 003074 ** the latest reserve size requested by SQLITE_FILECTRL_RESERVE_BYTES. 003075 ** The amount of reserve can only grow - never shrink. 003076 */ 003077 int sqlite3BtreeGetRequestedReserve(Btree *p){ 003078 int n1, n2; 003079 sqlite3BtreeEnter(p); 003080 n1 = (int)p->pBt->nReserveWanted; 003081 n2 = sqlite3BtreeGetReserveNoMutex(p); 003082 sqlite3BtreeLeave(p); 003083 return n1>n2 ? n1 : n2; 003084 } 003085 003086 003087 /* 003088 ** Set the maximum page count for a database if mxPage is positive. 003089 ** No changes are made if mxPage is 0 or negative. 003090 ** Regardless of the value of mxPage, return the maximum page count. 
003091 */ 003092 Pgno sqlite3BtreeMaxPageCount(Btree *p, Pgno mxPage){ 003093 Pgno n; 003094 sqlite3BtreeEnter(p); 003095 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); 003096 sqlite3BtreeLeave(p); 003097 return n; 003098 } 003099 003100 /* 003101 ** Change the values for the BTS_SECURE_DELETE and BTS_OVERWRITE flags: 003102 ** 003103 ** newFlag==0 Both BTS_SECURE_DELETE and BTS_OVERWRITE are cleared 003104 ** newFlag==1 BTS_SECURE_DELETE set and BTS_OVERWRITE is cleared 003105 ** newFlag==2 BTS_SECURE_DELETE cleared and BTS_OVERWRITE is set 003106 ** newFlag==(-1) No changes 003107 ** 003108 ** This routine acts as a query if newFlag is less than zero 003109 ** 003110 ** With BTS_OVERWRITE set, deleted content is overwritten by zeros, but 003111 ** freelist leaf pages are not written back to the database. Thus in-page 003112 ** deleted content is cleared, but freelist deleted content is not. 003113 ** 003114 ** With BTS_SECURE_DELETE, operation is like BTS_OVERWRITE with the addition 003115 ** that freelist leaf pages are written back into the database, increasing 003116 ** the amount of disk I/O. 003117 */ 003118 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){ 003119 int b; 003120 if( p==0 ) return 0; 003121 sqlite3BtreeEnter(p); 003122 assert( BTS_OVERWRITE==BTS_SECURE_DELETE*2 ); 003123 assert( BTS_FAST_SECURE==(BTS_OVERWRITE|BTS_SECURE_DELETE) ); 003124 if( newFlag>=0 ){ 003125 p->pBt->btsFlags &= ~BTS_FAST_SECURE; 003126 p->pBt->btsFlags |= BTS_SECURE_DELETE*newFlag; 003127 } 003128 b = (p->pBt->btsFlags & BTS_FAST_SECURE)/BTS_SECURE_DELETE; 003129 sqlite3BtreeLeave(p); 003130 return b; 003131 } 003132 003133 /* 003134 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum' 003135 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it 003136 ** is disabled. The default value for the auto-vacuum property is 003137 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. 
003138 */ 003139 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ 003140 #ifdef SQLITE_OMIT_AUTOVACUUM 003141 return SQLITE_READONLY; 003142 #else 003143 BtShared *pBt = p->pBt; 003144 int rc = SQLITE_OK; 003145 u8 av = (u8)autoVacuum; 003146 003147 sqlite3BtreeEnter(p); 003148 if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){ 003149 rc = SQLITE_READONLY; 003150 }else{ 003151 pBt->autoVacuum = av ?1:0; 003152 pBt->incrVacuum = av==2 ?1:0; 003153 } 003154 sqlite3BtreeLeave(p); 003155 return rc; 003156 #endif 003157 } 003158 003159 /* 003160 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 003161 ** enabled 1 is returned. Otherwise 0. 003162 */ 003163 int sqlite3BtreeGetAutoVacuum(Btree *p){ 003164 #ifdef SQLITE_OMIT_AUTOVACUUM 003165 return BTREE_AUTOVACUUM_NONE; 003166 #else 003167 int rc; 003168 sqlite3BtreeEnter(p); 003169 rc = ( 003170 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE: 003171 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL: 003172 BTREE_AUTOVACUUM_INCR 003173 ); 003174 sqlite3BtreeLeave(p); 003175 return rc; 003176 #endif 003177 } 003178 003179 /* 003180 ** If the user has not set the safety-level for this database connection 003181 ** using "PRAGMA synchronous", and if the safety-level is not already 003182 ** set to the value passed to this function as the second parameter, 003183 ** set it so. 
003184 */ 003185 #if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS \ 003186 && !defined(SQLITE_OMIT_WAL) 003187 static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){ 003188 sqlite3 *db; 003189 Db *pDb; 003190 if( (db=pBt->db)!=0 && (pDb=db->aDb)!=0 ){ 003191 while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){ pDb++; } 003192 if( pDb->bSyncSet==0 003193 && pDb->safety_level!=safety_level 003194 && pDb!=&db->aDb[1] 003195 ){ 003196 pDb->safety_level = safety_level; 003197 sqlite3PagerSetFlags(pBt->pPager, 003198 pDb->safety_level | (db->flags & PAGER_FLAGS_MASK)); 003199 } 003200 } 003201 } 003202 #else 003203 # define setDefaultSyncFlag(pBt,safety_level) 003204 #endif 003205 003206 /* Forward declaration */ 003207 static int newDatabase(BtShared*); 003208 003209 003210 /* 003211 ** Get a reference to pPage1 of the database file. This will 003212 ** also acquire a readlock on that file. 003213 ** 003214 ** SQLITE_OK is returned on success. If the file is not a 003215 ** well-formed database file, then SQLITE_CORRUPT is returned. 003216 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM 003217 ** is returned if we run out of memory. 003218 */ 003219 static int lockBtree(BtShared *pBt){ 003220 int rc; /* Result code from subfunctions */ 003221 MemPage *pPage1; /* Page 1 of the database file */ 003222 u32 nPage; /* Number of pages in the database */ 003223 u32 nPageFile = 0; /* Number of pages in the database file */ 003224 003225 assert( sqlite3_mutex_held(pBt->mutex) ); 003226 assert( pBt->pPage1==0 ); 003227 rc = sqlite3PagerSharedLock(pBt->pPager); 003228 if( rc!=SQLITE_OK ) return rc; 003229 rc = btreeGetPage(pBt, 1, &pPage1, 0); 003230 if( rc!=SQLITE_OK ) return rc; 003231 003232 /* Do some checking to help insure the file we opened really is 003233 ** a valid database file. 
003234 */ 003235 nPage = get4byte(28+(u8*)pPage1->aData); 003236 sqlite3PagerPagecount(pBt->pPager, (int*)&nPageFile); 003237 if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){ 003238 nPage = nPageFile; 003239 } 003240 if( (pBt->db->flags & SQLITE_ResetDatabase)!=0 ){ 003241 nPage = 0; 003242 } 003243 if( nPage>0 ){ 003244 u32 pageSize; 003245 u32 usableSize; 003246 u8 *page1 = pPage1->aData; 003247 rc = SQLITE_NOTADB; 003248 /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins 003249 ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d 003250 ** 61 74 20 33 00. */ 003251 if( memcmp(page1, zMagicHeader, 16)!=0 ){ 003252 goto page1_init_failed; 003253 } 003254 003255 #ifdef SQLITE_OMIT_WAL 003256 if( page1[18]>1 ){ 003257 pBt->btsFlags |= BTS_READ_ONLY; 003258 } 003259 if( page1[19]>1 ){ 003260 goto page1_init_failed; 003261 } 003262 #else 003263 if( page1[18]>2 ){ 003264 pBt->btsFlags |= BTS_READ_ONLY; 003265 } 003266 if( page1[19]>2 ){ 003267 goto page1_init_failed; 003268 } 003269 003270 /* If the read version is set to 2, this database should be accessed 003271 ** in WAL mode. If the log is not already open, open it now. Then 003272 ** return SQLITE_OK and return without populating BtShared.pPage1. 003273 ** The caller detects this and calls this function again. This is 003274 ** required as the version of page 1 currently in the page1 buffer 003275 ** may not be the latest version - there may be a newer one in the log 003276 ** file. 
003277 */ 003278 if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){ 003279 int isOpen = 0; 003280 rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen); 003281 if( rc!=SQLITE_OK ){ 003282 goto page1_init_failed; 003283 }else{ 003284 setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1); 003285 if( isOpen==0 ){ 003286 releasePageOne(pPage1); 003287 return SQLITE_OK; 003288 } 003289 } 003290 rc = SQLITE_NOTADB; 003291 }else{ 003292 setDefaultSyncFlag(pBt, SQLITE_DEFAULT_SYNCHRONOUS+1); 003293 } 003294 #endif 003295 003296 /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload 003297 ** fractions and the leaf payload fraction values must be 64, 32, and 32. 003298 ** 003299 ** The original design allowed these amounts to vary, but as of 003300 ** version 3.6.0, we require them to be fixed. 003301 */ 003302 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){ 003303 goto page1_init_failed; 003304 } 003305 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is 003306 ** determined by the 2-byte integer located at an offset of 16 bytes from 003307 ** the beginning of the database file. */ 003308 pageSize = (page1[16]<<8) | (page1[17]<<16); 003309 /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two 003310 ** between 512 and 65536 inclusive. */ 003311 if( ((pageSize-1)&pageSize)!=0 003312 || pageSize>SQLITE_MAX_PAGE_SIZE 003313 || pageSize<=256 003314 ){ 003315 goto page1_init_failed; 003316 } 003317 assert( (pageSize & 7)==0 ); 003318 /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte 003319 ** integer at offset 20 is the number of bytes of space at the end of 003320 ** each page to reserve for extensions. 003321 ** 003322 ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is 003323 ** determined by the one-byte unsigned integer found at an offset of 20 003324 ** into the database file header. 
*/ 003325 usableSize = pageSize - page1[20]; 003326 if( (u32)pageSize!=pBt->pageSize ){ 003327 /* After reading the first page of the database assuming a page size 003328 ** of BtShared.pageSize, we have discovered that the page-size is 003329 ** actually pageSize. Unlock the database, leave pBt->pPage1 at 003330 ** zero and return SQLITE_OK. The caller will call this function 003331 ** again with the correct page-size. 003332 */ 003333 releasePageOne(pPage1); 003334 pBt->usableSize = usableSize; 003335 pBt->pageSize = pageSize; 003336 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003337 freeTempSpace(pBt); 003338 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, 003339 pageSize-usableSize); 003340 return rc; 003341 } 003342 if( nPage>nPageFile ){ 003343 if( sqlite3WritableSchema(pBt->db)==0 ){ 003344 rc = SQLITE_CORRUPT_BKPT; 003345 goto page1_init_failed; 003346 }else{ 003347 nPage = nPageFile; 003348 } 003349 } 003350 /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to 003351 ** be less than 480. In other words, if the page size is 512, then the 003352 ** reserved space size cannot exceed 32. */ 003353 if( usableSize<480 ){ 003354 goto page1_init_failed; 003355 } 003356 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003357 pBt->pageSize = pageSize; 003358 pBt->usableSize = usableSize; 003359 #ifndef SQLITE_OMIT_AUTOVACUUM 003360 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0); 003361 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0); 003362 #endif 003363 } 003364 003365 /* maxLocal is the maximum amount of payload to store locally for 003366 ** a cell. Make sure it is small enough so that at least minFanout 003367 ** cells can will fit on one page. We assume a 10-byte page header. 
003368 ** Besides the payload, the cell must store: 003369 ** 2-byte pointer to the cell 003370 ** 4-byte child pointer 003371 ** 9-byte nKey value 003372 ** 4-byte nData value 003373 ** 4-byte overflow page pointer 003374 ** So a cell consists of a 2-byte pointer, a header which is as much as 003375 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow 003376 ** page pointer. 003377 */ 003378 pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23); 003379 pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23); 003380 pBt->maxLeaf = (u16)(pBt->usableSize - 35); 003381 pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23); 003382 if( pBt->maxLocal>127 ){ 003383 pBt->max1bytePayload = 127; 003384 }else{ 003385 pBt->max1bytePayload = (u8)pBt->maxLocal; 003386 } 003387 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) ); 003388 pBt->pPage1 = pPage1; 003389 pBt->nPage = nPage; 003390 return SQLITE_OK; 003391 003392 page1_init_failed: 003393 releasePageOne(pPage1); 003394 pBt->pPage1 = 0; 003395 return rc; 003396 } 003397 003398 #ifndef NDEBUG 003399 /* 003400 ** Return the number of cursors open on pBt. This is for use 003401 ** in assert() expressions, so it is only compiled if NDEBUG is not 003402 ** defined. 003403 ** 003404 ** Only write cursors are counted if wrOnly is true. If wrOnly is 003405 ** false then all cursors are counted. 003406 ** 003407 ** For the purposes of this routine, a cursor is any cursor that 003408 ** is capable of reading or writing to the database. Cursors that 003409 ** have been tripped into the CURSOR_FAULT state are not counted. 
003410 */ 003411 static int countValidCursors(BtShared *pBt, int wrOnly){ 003412 BtCursor *pCur; 003413 int r = 0; 003414 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 003415 if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0) 003416 && pCur->eState!=CURSOR_FAULT ) r++; 003417 } 003418 return r; 003419 } 003420 #endif 003421 003422 /* 003423 ** If there are no outstanding cursors and we are not in the middle 003424 ** of a transaction but there is a read lock on the database, then 003425 ** this routine unrefs the first page of the database file which 003426 ** has the effect of releasing the read lock. 003427 ** 003428 ** If there is a transaction in progress, this routine is a no-op. 003429 */ 003430 static void unlockBtreeIfUnused(BtShared *pBt){ 003431 assert( sqlite3_mutex_held(pBt->mutex) ); 003432 assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE ); 003433 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){ 003434 MemPage *pPage1 = pBt->pPage1; 003435 assert( pPage1->aData ); 003436 assert( sqlite3PagerRefcount(pBt->pPager)==1 ); 003437 pBt->pPage1 = 0; 003438 releasePageOne(pPage1); 003439 } 003440 } 003441 003442 /* 003443 ** If pBt points to an empty file then convert that empty file 003444 ** into a new empty database by initializing the first page of 003445 ** the database. 
003446 */ 003447 static int newDatabase(BtShared *pBt){ 003448 MemPage *pP1; 003449 unsigned char *data; 003450 int rc; 003451 003452 assert( sqlite3_mutex_held(pBt->mutex) ); 003453 if( pBt->nPage>0 ){ 003454 return SQLITE_OK; 003455 } 003456 pP1 = pBt->pPage1; 003457 assert( pP1!=0 ); 003458 data = pP1->aData; 003459 rc = sqlite3PagerWrite(pP1->pDbPage); 003460 if( rc ) return rc; 003461 memcpy(data, zMagicHeader, sizeof(zMagicHeader)); 003462 assert( sizeof(zMagicHeader)==16 ); 003463 data[16] = (u8)((pBt->pageSize>>8)&0xff); 003464 data[17] = (u8)((pBt->pageSize>>16)&0xff); 003465 data[18] = 1; 003466 data[19] = 1; 003467 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize); 003468 data[20] = (u8)(pBt->pageSize - pBt->usableSize); 003469 data[21] = 64; 003470 data[22] = 32; 003471 data[23] = 32; 003472 memset(&data[24], 0, 100-24); 003473 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA ); 003474 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003475 #ifndef SQLITE_OMIT_AUTOVACUUM 003476 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 ); 003477 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 ); 003478 put4byte(&data[36 + 4*4], pBt->autoVacuum); 003479 put4byte(&data[36 + 7*4], pBt->incrVacuum); 003480 #endif 003481 pBt->nPage = 1; 003482 data[31] = 1; 003483 return SQLITE_OK; 003484 } 003485 003486 /* 003487 ** Initialize the first page of the database file (creating a database 003488 ** consisting of a single page and no schema objects). Return SQLITE_OK 003489 ** if successful, or an SQLite error code otherwise. 003490 */ 003491 int sqlite3BtreeNewDb(Btree *p){ 003492 int rc; 003493 sqlite3BtreeEnter(p); 003494 p->pBt->nPage = 0; 003495 rc = newDatabase(p->pBt); 003496 sqlite3BtreeLeave(p); 003497 return rc; 003498 } 003499 003500 /* 003501 ** Attempt to start a new transaction. A write-transaction 003502 ** is started if the second argument is nonzero, otherwise a read- 003503 ** transaction. 
** If the second argument is 2 or more then an exclusive
** transaction is started, meaning that no other process is allowed
** to access the database.  A preexisting transaction may not be
** upgraded to exclusive by calling this routine a second time - the
** exclusivity flag only works for a new transaction.
**
** A write-transaction must be started before attempting any
** changes to the database.  None of the following routines
** will work unless a transaction is started first:
**
**      sqlite3BtreeCreateTable()
**      sqlite3BtreeCreateIndex()
**      sqlite3BtreeClearTable()
**      sqlite3BtreeDropTable()
**      sqlite3BtreeInsert()
**      sqlite3BtreeDelete()
**      sqlite3BtreeUpdateMeta()
**
** If an initial attempt to acquire the lock fails because of lock contention
** and the database was previously unlocked, then invoke the busy handler
** if there is one.  But if there was previously a read-lock, do not
** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
** returned when there is already a read-lock in order to avoid a deadlock.
**
** Suppose there are two processes A and B.  A has a read lock and B has
** a reserved lock.  B tries to promote to exclusive but is blocked because
** of A's read lock.  A tries to promote to reserved but is blocked by B.
** One or the other of the two processes must give way or there can be
** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
** when A already has a read lock, we encourage A to give up and let B
** proceed.
003534 */ 003535 static SQLITE_NOINLINE int btreeBeginTrans( 003536 Btree *p, /* The btree in which to start the transaction */ 003537 int wrflag, /* True to start a write transaction */ 003538 int *pSchemaVersion /* Put schema version number here, if not NULL */ 003539 ){ 003540 BtShared *pBt = p->pBt; 003541 Pager *pPager = pBt->pPager; 003542 int rc = SQLITE_OK; 003543 003544 sqlite3BtreeEnter(p); 003545 btreeIntegrity(p); 003546 003547 /* If the btree is already in a write-transaction, or it 003548 ** is already in a read-transaction and a read-transaction 003549 ** is requested, this is a no-op. 003550 */ 003551 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){ 003552 goto trans_begun; 003553 } 003554 assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 ); 003555 003556 if( (p->db->flags & SQLITE_ResetDatabase) 003557 && sqlite3PagerIsreadonly(pPager)==0 003558 ){ 003559 pBt->btsFlags &= ~BTS_READ_ONLY; 003560 } 003561 003562 /* Write transactions are not possible on a read-only database */ 003563 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){ 003564 rc = SQLITE_READONLY; 003565 goto trans_begun; 003566 } 003567 003568 #ifndef SQLITE_OMIT_SHARED_CACHE 003569 { 003570 sqlite3 *pBlock = 0; 003571 /* If another database handle has already opened a write transaction 003572 ** on this shared-btree structure and a second write transaction is 003573 ** requested, return SQLITE_LOCKED. 
003574 */ 003575 if( (wrflag && pBt->inTransaction==TRANS_WRITE) 003576 || (pBt->btsFlags & BTS_PENDING)!=0 003577 ){ 003578 pBlock = pBt->pWriter->db; 003579 }else if( wrflag>1 ){ 003580 BtLock *pIter; 003581 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 003582 if( pIter->pBtree!=p ){ 003583 pBlock = pIter->pBtree->db; 003584 break; 003585 } 003586 } 003587 } 003588 if( pBlock ){ 003589 sqlite3ConnectionBlocked(p->db, pBlock); 003590 rc = SQLITE_LOCKED_SHAREDCACHE; 003591 goto trans_begun; 003592 } 003593 } 003594 #endif 003595 003596 /* Any read-only or read-write transaction implies a read-lock on 003597 ** page 1. So if some other shared-cache client already has a write-lock 003598 ** on page 1, the transaction cannot be opened. */ 003599 rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK); 003600 if( SQLITE_OK!=rc ) goto trans_begun; 003601 003602 pBt->btsFlags &= ~BTS_INITIALLY_EMPTY; 003603 if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY; 003604 do { 003605 sqlite3PagerWalDb(pPager, p->db); 003606 003607 #ifdef SQLITE_ENABLE_SETLK_TIMEOUT 003608 /* If transitioning from no transaction directly to a write transaction, 003609 ** block for the WRITER lock first if possible. */ 003610 if( pBt->pPage1==0 && wrflag ){ 003611 assert( pBt->inTransaction==TRANS_NONE ); 003612 rc = sqlite3PagerWalWriteLock(pPager, 1); 003613 if( rc!=SQLITE_BUSY && rc!=SQLITE_OK ) break; 003614 } 003615 #endif 003616 003617 /* Call lockBtree() until either pBt->pPage1 is populated or 003618 ** lockBtree() returns something other than SQLITE_OK. lockBtree() 003619 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after 003620 ** reading page 1 it discovers that the page-size of the database 003621 ** file is not pBt->pageSize. In this case lockBtree() will update 003622 ** pBt->pageSize to the page-size of the file on disk. 
003623 */ 003624 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) ); 003625 003626 if( rc==SQLITE_OK && wrflag ){ 003627 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){ 003628 rc = SQLITE_READONLY; 003629 }else{ 003630 rc = sqlite3PagerBegin(pPager, wrflag>1, sqlite3TempInMemory(p->db)); 003631 if( rc==SQLITE_OK ){ 003632 rc = newDatabase(pBt); 003633 }else if( rc==SQLITE_BUSY_SNAPSHOT && pBt->inTransaction==TRANS_NONE ){ 003634 /* if there was no transaction opened when this function was 003635 ** called and SQLITE_BUSY_SNAPSHOT is returned, change the error 003636 ** code to SQLITE_BUSY. */ 003637 rc = SQLITE_BUSY; 003638 } 003639 } 003640 } 003641 003642 if( rc!=SQLITE_OK ){ 003643 (void)sqlite3PagerWalWriteLock(pPager, 0); 003644 unlockBtreeIfUnused(pBt); 003645 } 003646 }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && 003647 btreeInvokeBusyHandler(pBt) ); 003648 sqlite3PagerWalDb(pPager, 0); 003649 #ifdef SQLITE_ENABLE_SETLK_TIMEOUT 003650 if( rc==SQLITE_BUSY_TIMEOUT ) rc = SQLITE_BUSY; 003651 #endif 003652 003653 if( rc==SQLITE_OK ){ 003654 if( p->inTrans==TRANS_NONE ){ 003655 pBt->nTransaction++; 003656 #ifndef SQLITE_OMIT_SHARED_CACHE 003657 if( p->sharable ){ 003658 assert( p->lock.pBtree==p && p->lock.iTable==1 ); 003659 p->lock.eLock = READ_LOCK; 003660 p->lock.pNext = pBt->pLock; 003661 pBt->pLock = &p->lock; 003662 } 003663 #endif 003664 } 003665 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ); 003666 if( p->inTrans>pBt->inTransaction ){ 003667 pBt->inTransaction = p->inTrans; 003668 } 003669 if( wrflag ){ 003670 MemPage *pPage1 = pBt->pPage1; 003671 #ifndef SQLITE_OMIT_SHARED_CACHE 003672 assert( !pBt->pWriter ); 003673 pBt->pWriter = p; 003674 pBt->btsFlags &= ~BTS_EXCLUSIVE; 003675 if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE; 003676 #endif 003677 003678 /* If the db-size header field is incorrect (as it may be if an old 003679 ** client has been writing the database file), update it now. 
Doing 003680 ** this sooner rather than later means the database size can safely 003681 ** re-read the database size from page 1 if a savepoint or transaction 003682 ** rollback occurs within the transaction. 003683 */ 003684 if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){ 003685 rc = sqlite3PagerWrite(pPage1->pDbPage); 003686 if( rc==SQLITE_OK ){ 003687 put4byte(&pPage1->aData[28], pBt->nPage); 003688 } 003689 } 003690 } 003691 } 003692 003693 trans_begun: 003694 if( rc==SQLITE_OK ){ 003695 if( pSchemaVersion ){ 003696 *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]); 003697 } 003698 if( wrflag ){ 003699 /* This call makes sure that the pager has the correct number of 003700 ** open savepoints. If the second parameter is greater than 0 and 003701 ** the sub-journal is not already open, then it will be opened here. 003702 */ 003703 rc = sqlite3PagerOpenSavepoint(pPager, p->db->nSavepoint); 003704 } 003705 } 003706 003707 btreeIntegrity(p); 003708 sqlite3BtreeLeave(p); 003709 return rc; 003710 } 003711 int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){ 003712 BtShared *pBt; 003713 if( p->sharable 003714 || p->inTrans==TRANS_NONE 003715 || (p->inTrans==TRANS_READ && wrflag!=0) 003716 ){ 003717 return btreeBeginTrans(p,wrflag,pSchemaVersion); 003718 } 003719 pBt = p->pBt; 003720 if( pSchemaVersion ){ 003721 *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]); 003722 } 003723 if( wrflag ){ 003724 /* This call makes sure that the pager has the correct number of 003725 ** open savepoints. If the second parameter is greater than 0 and 003726 ** the sub-journal is not already open, then it will be opened here. 003727 */ 003728 return sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint); 003729 }else{ 003730 return SQLITE_OK; 003731 } 003732 } 003733 003734 #ifndef SQLITE_OMIT_AUTOVACUUM 003735 003736 /* 003737 ** Set the pointer-map entries for all children of page pPage. 
Also, if 003738 ** pPage contains cells that point to overflow pages, set the pointer 003739 ** map entries for the overflow pages as well. 003740 */ 003741 static int setChildPtrmaps(MemPage *pPage){ 003742 int i; /* Counter variable */ 003743 int nCell; /* Number of cells in page pPage */ 003744 int rc; /* Return code */ 003745 BtShared *pBt = pPage->pBt; 003746 Pgno pgno = pPage->pgno; 003747 003748 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 003749 rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage); 003750 if( rc!=SQLITE_OK ) return rc; 003751 nCell = pPage->nCell; 003752 003753 for(i=0; i<nCell; i++){ 003754 u8 *pCell = findCell(pPage, i); 003755 003756 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc); 003757 003758 if( !pPage->leaf ){ 003759 Pgno childPgno = get4byte(pCell); 003760 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 003761 } 003762 } 003763 003764 if( !pPage->leaf ){ 003765 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 003766 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 003767 } 003768 003769 return rc; 003770 } 003771 003772 /* 003773 ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so 003774 ** that it points to iTo. Parameter eType describes the type of pointer to 003775 ** be modified, as follows: 003776 ** 003777 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child 003778 ** page of pPage. 003779 ** 003780 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow 003781 ** page pointed to by one of the cells on pPage. 003782 ** 003783 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next 003784 ** overflow page in the list. 
003785 */ 003786 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ 003787 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 003788 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 003789 if( eType==PTRMAP_OVERFLOW2 ){ 003790 /* The pointer is always the first 4 bytes of the page in this case. */ 003791 if( get4byte(pPage->aData)!=iFrom ){ 003792 return SQLITE_CORRUPT_PAGE(pPage); 003793 } 003794 put4byte(pPage->aData, iTo); 003795 }else{ 003796 int i; 003797 int nCell; 003798 int rc; 003799 003800 rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage); 003801 if( rc ) return rc; 003802 nCell = pPage->nCell; 003803 003804 for(i=0; i<nCell; i++){ 003805 u8 *pCell = findCell(pPage, i); 003806 if( eType==PTRMAP_OVERFLOW1 ){ 003807 CellInfo info; 003808 pPage->xParseCell(pPage, pCell, &info); 003809 if( info.nLocal<info.nPayload ){ 003810 if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){ 003811 return SQLITE_CORRUPT_PAGE(pPage); 003812 } 003813 if( iFrom==get4byte(pCell+info.nSize-4) ){ 003814 put4byte(pCell+info.nSize-4, iTo); 003815 break; 003816 } 003817 } 003818 }else{ 003819 if( pCell+4 > pPage->aData+pPage->pBt->usableSize ){ 003820 return SQLITE_CORRUPT_PAGE(pPage); 003821 } 003822 if( get4byte(pCell)==iFrom ){ 003823 put4byte(pCell, iTo); 003824 break; 003825 } 003826 } 003827 } 003828 003829 if( i==nCell ){ 003830 if( eType!=PTRMAP_BTREE || 003831 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){ 003832 return SQLITE_CORRUPT_PAGE(pPage); 003833 } 003834 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo); 003835 } 003836 } 003837 return SQLITE_OK; 003838 } 003839 003840 003841 /* 003842 ** Move the open database page pDbPage to location iFreePage in the 003843 ** database. The pDbPage reference remains valid. 003844 ** 003845 ** The isCommit flag indicates that there is no need to remember that 003846 ** the journal needs to be sync()ed before database page pDbPage->pgno 003847 ** can be written to. 
The caller has already promised not to write to that 003848 ** page. 003849 */ 003850 static int relocatePage( 003851 BtShared *pBt, /* Btree */ 003852 MemPage *pDbPage, /* Open page to move */ 003853 u8 eType, /* Pointer map 'type' entry for pDbPage */ 003854 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */ 003855 Pgno iFreePage, /* The location to move pDbPage to */ 003856 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */ 003857 ){ 003858 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */ 003859 Pgno iDbPage = pDbPage->pgno; 003860 Pager *pPager = pBt->pPager; 003861 int rc; 003862 003863 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 003864 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ); 003865 assert( sqlite3_mutex_held(pBt->mutex) ); 003866 assert( pDbPage->pBt==pBt ); 003867 if( iDbPage<3 ) return SQLITE_CORRUPT_BKPT; 003868 003869 /* Move page iDbPage from its current location to page number iFreePage */ 003870 TRACE(("AUTOVACUUM: Moving %u to free page %u (ptr page %u type %u)\n", 003871 iDbPage, iFreePage, iPtrPage, eType)); 003872 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit); 003873 if( rc!=SQLITE_OK ){ 003874 return rc; 003875 } 003876 pDbPage->pgno = iFreePage; 003877 003878 /* If pDbPage was a btree-page, then it may have child pages and/or cells 003879 ** that point to overflow pages. The pointer map entries for all these 003880 ** pages need to be changed. 003881 ** 003882 ** If pDbPage is an overflow page, then the first 4 bytes may store a 003883 ** pointer to a subsequent overflow page. If this is the case, then 003884 ** the pointer map needs to be updated for the subsequent overflow page. 
003885 */ 003886 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){ 003887 rc = setChildPtrmaps(pDbPage); 003888 if( rc!=SQLITE_OK ){ 003889 return rc; 003890 } 003891 }else{ 003892 Pgno nextOvfl = get4byte(pDbPage->aData); 003893 if( nextOvfl!=0 ){ 003894 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc); 003895 if( rc!=SQLITE_OK ){ 003896 return rc; 003897 } 003898 } 003899 } 003900 003901 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so 003902 ** that it points at iFreePage. Also fix the pointer map entry for 003903 ** iPtrPage. 003904 */ 003905 if( eType!=PTRMAP_ROOTPAGE ){ 003906 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0); 003907 if( rc!=SQLITE_OK ){ 003908 return rc; 003909 } 003910 rc = sqlite3PagerWrite(pPtrPage->pDbPage); 003911 if( rc!=SQLITE_OK ){ 003912 releasePage(pPtrPage); 003913 return rc; 003914 } 003915 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType); 003916 releasePage(pPtrPage); 003917 if( rc==SQLITE_OK ){ 003918 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc); 003919 } 003920 } 003921 return rc; 003922 } 003923 003924 /* Forward declaration required by incrVacuumStep(). */ 003925 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); 003926 003927 /* 003928 ** Perform a single step of an incremental-vacuum. If successful, return 003929 ** SQLITE_OK. If there is no work to do (and therefore no point in 003930 ** calling this function again), return SQLITE_DONE. Or, if an error 003931 ** occurs, return some other error code. 003932 ** 003933 ** More specifically, this function attempts to re-organize the database so 003934 ** that the last page of the file currently in use is no longer in use. 003935 ** 003936 ** Parameter nFin is the number of pages that this database would contain 003937 ** were this function called until it returns SQLITE_DONE. 
003938 ** 003939 ** If the bCommit parameter is non-zero, this function assumes that the 003940 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE 003941 ** or an error. bCommit is passed true for an auto-vacuum-on-commit 003942 ** operation, or false for an incremental vacuum. 003943 */ 003944 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){ 003945 Pgno nFreeList; /* Number of pages still on the free-list */ 003946 int rc; 003947 003948 assert( sqlite3_mutex_held(pBt->mutex) ); 003949 assert( iLastPg>nFin ); 003950 003951 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){ 003952 u8 eType; 003953 Pgno iPtrPage; 003954 003955 nFreeList = get4byte(&pBt->pPage1->aData[36]); 003956 if( nFreeList==0 ){ 003957 return SQLITE_DONE; 003958 } 003959 003960 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage); 003961 if( rc!=SQLITE_OK ){ 003962 return rc; 003963 } 003964 if( eType==PTRMAP_ROOTPAGE ){ 003965 return SQLITE_CORRUPT_BKPT; 003966 } 003967 003968 if( eType==PTRMAP_FREEPAGE ){ 003969 if( bCommit==0 ){ 003970 /* Remove the page from the files free-list. This is not required 003971 ** if bCommit is non-zero. In that case, the free-list will be 003972 ** truncated to zero after this function returns, so it doesn't 003973 ** matter if it still contains some garbage entries. 
003974 */ 003975 Pgno iFreePg; 003976 MemPage *pFreePg; 003977 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT); 003978 if( rc!=SQLITE_OK ){ 003979 return rc; 003980 } 003981 assert( iFreePg==iLastPg ); 003982 releasePage(pFreePg); 003983 } 003984 } else { 003985 Pgno iFreePg; /* Index of free page to move pLastPg to */ 003986 MemPage *pLastPg; 003987 u8 eMode = BTALLOC_ANY; /* Mode parameter for allocateBtreePage() */ 003988 Pgno iNear = 0; /* nearby parameter for allocateBtreePage() */ 003989 003990 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0); 003991 if( rc!=SQLITE_OK ){ 003992 return rc; 003993 } 003994 003995 /* If bCommit is zero, this loop runs exactly once and page pLastPg 003996 ** is swapped with the first free page pulled off the free list. 003997 ** 003998 ** On the other hand, if bCommit is greater than zero, then keep 003999 ** looping until a free-page located within the first nFin pages 004000 ** of the file is found. 004001 */ 004002 if( bCommit==0 ){ 004003 eMode = BTALLOC_LE; 004004 iNear = nFin; 004005 } 004006 do { 004007 MemPage *pFreePg; 004008 Pgno dbSize = btreePagecount(pBt); 004009 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode); 004010 if( rc!=SQLITE_OK ){ 004011 releasePage(pLastPg); 004012 return rc; 004013 } 004014 releasePage(pFreePg); 004015 if( iFreePg>dbSize ){ 004016 releasePage(pLastPg); 004017 return SQLITE_CORRUPT_BKPT; 004018 } 004019 }while( bCommit && iFreePg>nFin ); 004020 assert( iFreePg<iLastPg ); 004021 004022 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit); 004023 releasePage(pLastPg); 004024 if( rc!=SQLITE_OK ){ 004025 return rc; 004026 } 004027 } 004028 } 004029 004030 if( bCommit==0 ){ 004031 do { 004032 iLastPg--; 004033 }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) ); 004034 pBt->bDoTruncate = 1; 004035 pBt->nPage = iLastPg; 004036 } 004037 return SQLITE_OK; 004038 } 004039 004040 /* 004041 ** The database opened by the first argument 
is an auto-vacuum database 004042 ** nOrig pages in size containing nFree free pages. Return the expected 004043 ** size of the database in pages following an auto-vacuum operation. 004044 */ 004045 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){ 004046 int nEntry; /* Number of entries on one ptrmap page */ 004047 Pgno nPtrmap; /* Number of PtrMap pages to be freed */ 004048 Pgno nFin; /* Return value */ 004049 004050 nEntry = pBt->usableSize/5; 004051 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry; 004052 nFin = nOrig - nFree - nPtrmap; 004053 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){ 004054 nFin--; 004055 } 004056 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ 004057 nFin--; 004058 } 004059 004060 return nFin; 004061 } 004062 004063 /* 004064 ** A write-transaction must be opened before calling this function. 004065 ** It performs a single unit of work towards an incremental vacuum. 004066 ** 004067 ** If the incremental vacuum is finished after this function has run, 004068 ** SQLITE_DONE is returned. If it is not finished, but no error occurred, 004069 ** SQLITE_OK is returned. Otherwise an SQLite error code. 
004070 */ 004071 int sqlite3BtreeIncrVacuum(Btree *p){ 004072 int rc; 004073 BtShared *pBt = p->pBt; 004074 004075 sqlite3BtreeEnter(p); 004076 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE ); 004077 if( !pBt->autoVacuum ){ 004078 rc = SQLITE_DONE; 004079 }else{ 004080 Pgno nOrig = btreePagecount(pBt); 004081 Pgno nFree = get4byte(&pBt->pPage1->aData[36]); 004082 Pgno nFin = finalDbSize(pBt, nOrig, nFree); 004083 004084 if( nOrig<nFin || nFree>=nOrig ){ 004085 rc = SQLITE_CORRUPT_BKPT; 004086 }else if( nFree>0 ){ 004087 rc = saveAllCursors(pBt, 0, 0); 004088 if( rc==SQLITE_OK ){ 004089 invalidateAllOverflowCache(pBt); 004090 rc = incrVacuumStep(pBt, nFin, nOrig, 0); 004091 } 004092 if( rc==SQLITE_OK ){ 004093 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 004094 put4byte(&pBt->pPage1->aData[28], pBt->nPage); 004095 } 004096 }else{ 004097 rc = SQLITE_DONE; 004098 } 004099 } 004100 sqlite3BtreeLeave(p); 004101 return rc; 004102 } 004103 004104 /* 004105 ** This routine is called prior to sqlite3PagerCommit when a transaction 004106 ** is committed for an auto-vacuum database. 
004107 */ 004108 static int autoVacuumCommit(Btree *p){ 004109 int rc = SQLITE_OK; 004110 Pager *pPager; 004111 BtShared *pBt; 004112 sqlite3 *db; 004113 VVA_ONLY( int nRef ); 004114 004115 assert( p!=0 ); 004116 pBt = p->pBt; 004117 pPager = pBt->pPager; 004118 VVA_ONLY( nRef = sqlite3PagerRefcount(pPager); ) 004119 004120 assert( sqlite3_mutex_held(pBt->mutex) ); 004121 invalidateAllOverflowCache(pBt); 004122 assert(pBt->autoVacuum); 004123 if( !pBt->incrVacuum ){ 004124 Pgno nFin; /* Number of pages in database after autovacuuming */ 004125 Pgno nFree; /* Number of pages on the freelist initially */ 004126 Pgno nVac; /* Number of pages to vacuum */ 004127 Pgno iFree; /* The next page to be freed */ 004128 Pgno nOrig; /* Database size before freeing */ 004129 004130 nOrig = btreePagecount(pBt); 004131 if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){ 004132 /* It is not possible to create a database for which the final page 004133 ** is either a pointer-map page or the pending-byte page. If one 004134 ** is encountered, this indicates corruption. 
004135 */ 004136 return SQLITE_CORRUPT_BKPT; 004137 } 004138 004139 nFree = get4byte(&pBt->pPage1->aData[36]); 004140 db = p->db; 004141 if( db->xAutovacPages ){ 004142 int iDb; 004143 for(iDb=0; ALWAYS(iDb<db->nDb); iDb++){ 004144 if( db->aDb[iDb].pBt==p ) break; 004145 } 004146 nVac = db->xAutovacPages( 004147 db->pAutovacPagesArg, 004148 db->aDb[iDb].zDbSName, 004149 nOrig, 004150 nFree, 004151 pBt->pageSize 004152 ); 004153 if( nVac>nFree ){ 004154 nVac = nFree; 004155 } 004156 if( nVac==0 ){ 004157 return SQLITE_OK; 004158 } 004159 }else{ 004160 nVac = nFree; 004161 } 004162 nFin = finalDbSize(pBt, nOrig, nVac); 004163 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT; 004164 if( nFin<nOrig ){ 004165 rc = saveAllCursors(pBt, 0, 0); 004166 } 004167 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){ 004168 rc = incrVacuumStep(pBt, nFin, iFree, nVac==nFree); 004169 } 004170 if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){ 004171 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 004172 if( nVac==nFree ){ 004173 put4byte(&pBt->pPage1->aData[32], 0); 004174 put4byte(&pBt->pPage1->aData[36], 0); 004175 } 004176 put4byte(&pBt->pPage1->aData[28], nFin); 004177 pBt->bDoTruncate = 1; 004178 pBt->nPage = nFin; 004179 } 004180 if( rc!=SQLITE_OK ){ 004181 sqlite3PagerRollback(pPager); 004182 } 004183 } 004184 004185 assert( nRef>=sqlite3PagerRefcount(pPager) ); 004186 return rc; 004187 } 004188 004189 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */ 004190 # define setChildPtrmaps(x) SQLITE_OK 004191 #endif 004192 004193 /* 004194 ** This routine does the first phase of a two-phase commit. This routine 004195 ** causes a rollback journal to be created (if it does not already exist) 004196 ** and populated with enough information so that if a power loss occurs 004197 ** the database can be restored to its original state by playing back 004198 ** the journal. Then the contents of the journal are flushed out to 004199 ** the disk. 
After the journal is safely on oxide, the changes to the 004200 ** database are written into the database file and flushed to oxide. 004201 ** At the end of this call, the rollback journal still exists on the 004202 ** disk and we are still holding all locks, so the transaction has not 004203 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the 004204 ** commit process. 004205 ** 004206 ** This call is a no-op if no write-transaction is currently active on pBt. 004207 ** 004208 ** Otherwise, sync the database file for the btree pBt. zSuperJrnl points to 004209 ** the name of a super-journal file that should be written into the 004210 ** individual journal file, or is NULL, indicating no super-journal file 004211 ** (single database transaction). 004212 ** 004213 ** When this is called, the super-journal should already have been 004214 ** created, populated with this journal pointer and synced to disk. 004215 ** 004216 ** Once this is routine has returned, the only thing required to commit 004217 ** the write-transaction for this database file is to delete the journal. 004218 */ 004219 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zSuperJrnl){ 004220 int rc = SQLITE_OK; 004221 if( p->inTrans==TRANS_WRITE ){ 004222 BtShared *pBt = p->pBt; 004223 sqlite3BtreeEnter(p); 004224 #ifndef SQLITE_OMIT_AUTOVACUUM 004225 if( pBt->autoVacuum ){ 004226 rc = autoVacuumCommit(p); 004227 if( rc!=SQLITE_OK ){ 004228 sqlite3BtreeLeave(p); 004229 return rc; 004230 } 004231 } 004232 if( pBt->bDoTruncate ){ 004233 sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage); 004234 } 004235 #endif 004236 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zSuperJrnl, 0); 004237 sqlite3BtreeLeave(p); 004238 } 004239 return rc; 004240 } 004241 004242 /* 004243 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback() 004244 ** at the conclusion of a transaction. 
004245 */ 004246 static void btreeEndTransaction(Btree *p){ 004247 BtShared *pBt = p->pBt; 004248 sqlite3 *db = p->db; 004249 assert( sqlite3BtreeHoldsMutex(p) ); 004250 004251 #ifndef SQLITE_OMIT_AUTOVACUUM 004252 pBt->bDoTruncate = 0; 004253 #endif 004254 if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){ 004255 /* If there are other active statements that belong to this database 004256 ** handle, downgrade to a read-only transaction. The other statements 004257 ** may still be reading from the database. */ 004258 downgradeAllSharedCacheTableLocks(p); 004259 p->inTrans = TRANS_READ; 004260 }else{ 004261 /* If the handle had any kind of transaction open, decrement the 004262 ** transaction count of the shared btree. If the transaction count 004263 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused() 004264 ** call below will unlock the pager. */ 004265 if( p->inTrans!=TRANS_NONE ){ 004266 clearAllSharedCacheTableLocks(p); 004267 pBt->nTransaction--; 004268 if( 0==pBt->nTransaction ){ 004269 pBt->inTransaction = TRANS_NONE; 004270 } 004271 } 004272 004273 /* Set the current transaction state to TRANS_NONE and unlock the 004274 ** pager if this call closed the only read or write transaction. */ 004275 p->inTrans = TRANS_NONE; 004276 unlockBtreeIfUnused(pBt); 004277 } 004278 004279 btreeIntegrity(p); 004280 } 004281 004282 /* 004283 ** Commit the transaction currently in progress. 004284 ** 004285 ** This routine implements the second phase of a 2-phase commit. The 004286 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should 004287 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne() 004288 ** routine did all the work of writing information out to disk and flushing the 004289 ** contents so that they are written onto the disk platter. 
All this 004290 ** routine has to do is delete or truncate or zero the header in the 004291 ** the rollback journal (which causes the transaction to commit) and 004292 ** drop locks. 004293 ** 004294 ** Normally, if an error occurs while the pager layer is attempting to 004295 ** finalize the underlying journal file, this function returns an error and 004296 ** the upper layer will attempt a rollback. However, if the second argument 004297 ** is non-zero then this b-tree transaction is part of a multi-file 004298 ** transaction. In this case, the transaction has already been committed 004299 ** (by deleting a super-journal file) and the caller will ignore this 004300 ** functions return code. So, even if an error occurs in the pager layer, 004301 ** reset the b-tree objects internal state to indicate that the write 004302 ** transaction has been closed. This is quite safe, as the pager will have 004303 ** transitioned to the error state. 004304 ** 004305 ** This will release the write lock on the database file. If there 004306 ** are no active cursors, it also releases the read lock. 004307 */ 004308 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){ 004309 004310 if( p->inTrans==TRANS_NONE ) return SQLITE_OK; 004311 sqlite3BtreeEnter(p); 004312 btreeIntegrity(p); 004313 004314 /* If the handle has a write-transaction open, commit the shared-btrees 004315 ** transaction and set the shared state to TRANS_READ. 
004316 */ 004317 if( p->inTrans==TRANS_WRITE ){ 004318 int rc; 004319 BtShared *pBt = p->pBt; 004320 assert( pBt->inTransaction==TRANS_WRITE ); 004321 assert( pBt->nTransaction>0 ); 004322 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager); 004323 if( rc!=SQLITE_OK && bCleanup==0 ){ 004324 sqlite3BtreeLeave(p); 004325 return rc; 004326 } 004327 p->iBDataVersion--; /* Compensate for pPager->iDataVersion++; */ 004328 pBt->inTransaction = TRANS_READ; 004329 btreeClearHasContent(pBt); 004330 } 004331 004332 btreeEndTransaction(p); 004333 sqlite3BtreeLeave(p); 004334 return SQLITE_OK; 004335 } 004336 004337 /* 004338 ** Do both phases of a commit. 004339 */ 004340 int sqlite3BtreeCommit(Btree *p){ 004341 int rc; 004342 sqlite3BtreeEnter(p); 004343 rc = sqlite3BtreeCommitPhaseOne(p, 0); 004344 if( rc==SQLITE_OK ){ 004345 rc = sqlite3BtreeCommitPhaseTwo(p, 0); 004346 } 004347 sqlite3BtreeLeave(p); 004348 return rc; 004349 } 004350 004351 /* 004352 ** This routine sets the state to CURSOR_FAULT and the error 004353 ** code to errCode for every cursor on any BtShared that pBtree 004354 ** references. Or if the writeOnly flag is set to 1, then only 004355 ** trip write cursors and leave read cursors unchanged. 004356 ** 004357 ** Every cursor is a candidate to be tripped, including cursors 004358 ** that belong to other database connections that happen to be 004359 ** sharing the cache with pBtree. 004360 ** 004361 ** This routine gets called when a rollback occurs. If the writeOnly 004362 ** flag is true, then only write-cursors need be tripped - read-only 004363 ** cursors save their current positions so that they may continue 004364 ** following the rollback. Or, if writeOnly is false, all cursors are 004365 ** tripped. In general, writeOnly is false if the transaction being 004366 ** rolled back modified the database schema. In this case b-tree root 004367 ** pages may be moved or deleted from the database altogether, making 004368 ** it unsafe for read cursors to continue. 
004369 ** 004370 ** If the writeOnly flag is true and an error is encountered while 004371 ** saving the current position of a read-only cursor, all cursors, 004372 ** including all read-cursors are tripped. 004373 ** 004374 ** SQLITE_OK is returned if successful, or if an error occurs while 004375 ** saving a cursor position, an SQLite error code. 004376 */ 004377 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){ 004378 BtCursor *p; 004379 int rc = SQLITE_OK; 004380 004381 assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 ); 004382 if( pBtree ){ 004383 sqlite3BtreeEnter(pBtree); 004384 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 004385 if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){ 004386 if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){ 004387 rc = saveCursorPosition(p); 004388 if( rc!=SQLITE_OK ){ 004389 (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0); 004390 break; 004391 } 004392 } 004393 }else{ 004394 sqlite3BtreeClearCursor(p); 004395 p->eState = CURSOR_FAULT; 004396 p->skipNext = errCode; 004397 } 004398 btreeReleaseAllCursorPages(p); 004399 } 004400 sqlite3BtreeLeave(pBtree); 004401 } 004402 return rc; 004403 } 004404 004405 /* 004406 ** Set the pBt->nPage field correctly, according to the current 004407 ** state of the database. Assume pBt->pPage1 is valid. 004408 */ 004409 static void btreeSetNPage(BtShared *pBt, MemPage *pPage1){ 004410 int nPage = get4byte(&pPage1->aData[28]); 004411 testcase( nPage==0 ); 004412 if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage); 004413 testcase( pBt->nPage!=(u32)nPage ); 004414 pBt->nPage = nPage; 004415 } 004416 004417 /* 004418 ** Rollback the transaction in progress. 004419 ** 004420 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped). 004421 ** Only write cursors are tripped if writeOnly is true but all cursors are 004422 ** tripped if writeOnly is false. Any attempt to use 004423 ** a tripped cursor will result in an error. 
004424 ** 004425 ** This will release the write lock on the database file. If there 004426 ** are no active cursors, it also releases the read lock. 004427 */ 004428 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){ 004429 int rc; 004430 BtShared *pBt = p->pBt; 004431 MemPage *pPage1; 004432 004433 assert( writeOnly==1 || writeOnly==0 ); 004434 assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK ); 004435 sqlite3BtreeEnter(p); 004436 if( tripCode==SQLITE_OK ){ 004437 rc = tripCode = saveAllCursors(pBt, 0, 0); 004438 if( rc ) writeOnly = 0; 004439 }else{ 004440 rc = SQLITE_OK; 004441 } 004442 if( tripCode ){ 004443 int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly); 004444 assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) ); 004445 if( rc2!=SQLITE_OK ) rc = rc2; 004446 } 004447 btreeIntegrity(p); 004448 004449 if( p->inTrans==TRANS_WRITE ){ 004450 int rc2; 004451 004452 assert( TRANS_WRITE==pBt->inTransaction ); 004453 rc2 = sqlite3PagerRollback(pBt->pPager); 004454 if( rc2!=SQLITE_OK ){ 004455 rc = rc2; 004456 } 004457 004458 /* The rollback may have destroyed the pPage1->aData value. So 004459 ** call btreeGetPage() on page 1 again to make 004460 ** sure pPage1->aData is set correctly. */ 004461 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ 004462 btreeSetNPage(pBt, pPage1); 004463 releasePageOne(pPage1); 004464 } 004465 assert( countValidCursors(pBt, 1)==0 ); 004466 pBt->inTransaction = TRANS_READ; 004467 btreeClearHasContent(pBt); 004468 } 004469 004470 btreeEndTransaction(p); 004471 sqlite3BtreeLeave(p); 004472 return rc; 004473 } 004474 004475 /* 004476 ** Start a statement subtransaction. The subtransaction can be rolled 004477 ** back independently of the main transaction. You must start a transaction 004478 ** before starting a subtransaction. The subtransaction is ended automatically 004479 ** if the main transaction commits or rolls back. 
004480 ** 004481 ** Statement subtransactions are used around individual SQL statements 004482 ** that are contained within a BEGIN...COMMIT block. If a constraint 004483 ** error occurs within the statement, the effect of that one statement 004484 ** can be rolled back without having to rollback the entire transaction. 004485 ** 004486 ** A statement sub-transaction is implemented as an anonymous savepoint. The 004487 ** value passed as the second parameter is the total number of savepoints, 004488 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there 004489 ** are no active savepoints and no other statement-transactions open, 004490 ** iStatement is 1. This anonymous savepoint can be released or rolled back 004491 ** using the sqlite3BtreeSavepoint() function. 004492 */ 004493 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){ 004494 int rc; 004495 BtShared *pBt = p->pBt; 004496 sqlite3BtreeEnter(p); 004497 assert( p->inTrans==TRANS_WRITE ); 004498 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 004499 assert( iStatement>0 ); 004500 assert( iStatement>p->db->nSavepoint ); 004501 assert( pBt->inTransaction==TRANS_WRITE ); 004502 /* At the pager level, a statement transaction is a savepoint with 004503 ** an index greater than all savepoints created explicitly using 004504 ** SQL statements. It is illegal to open, release or rollback any 004505 ** such savepoints while the statement transaction savepoint is active. 004506 */ 004507 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement); 004508 sqlite3BtreeLeave(p); 004509 return rc; 004510 } 004511 004512 /* 004513 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK 004514 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the 004515 ** savepoint identified by parameter iSavepoint, depending on the value 004516 ** of op. 004517 ** 004518 ** Normally, iSavepoint is greater than or equal to zero. 
However, if op is 004519 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the 004520 ** contents of the entire transaction are rolled back. This is different 004521 ** from a normal transaction rollback, as no locks are released and the 004522 ** transaction remains open. 004523 */ 004524 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){ 004525 int rc = SQLITE_OK; 004526 if( p && p->inTrans==TRANS_WRITE ){ 004527 BtShared *pBt = p->pBt; 004528 assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK ); 004529 assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) ); 004530 sqlite3BtreeEnter(p); 004531 if( op==SAVEPOINT_ROLLBACK ){ 004532 rc = saveAllCursors(pBt, 0, 0); 004533 } 004534 if( rc==SQLITE_OK ){ 004535 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint); 004536 } 004537 if( rc==SQLITE_OK ){ 004538 if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){ 004539 pBt->nPage = 0; 004540 } 004541 rc = newDatabase(pBt); 004542 btreeSetNPage(pBt, pBt->pPage1); 004543 004544 /* pBt->nPage might be zero if the database was corrupt when 004545 ** the transaction was started. Otherwise, it must be at least 1. */ 004546 assert( CORRUPT_DB || pBt->nPage>0 ); 004547 } 004548 sqlite3BtreeLeave(p); 004549 } 004550 return rc; 004551 } 004552 004553 /* 004554 ** Create a new cursor for the BTree whose root is on the page 004555 ** iTable. If a read-only cursor is requested, it is assumed that 004556 ** the caller already has at least a read-only transaction open 004557 ** on the database already. If a write-cursor is requested, then 004558 ** the caller is assumed to have an open write transaction. 004559 ** 004560 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only 004561 ** be used for reading. If the BTREE_WRCSR bit is set, then the cursor 004562 ** can be used for reading or for writing if other conditions for writing 004563 ** are also met. 
These are the conditions that must be met in order 004564 ** for writing to be allowed: 004565 ** 004566 ** 1: The cursor must have been opened with wrFlag containing BTREE_WRCSR 004567 ** 004568 ** 2: Other database connections that share the same pager cache 004569 ** but which are not in the READ_UNCOMMITTED state may not have 004570 ** cursors open with wrFlag==0 on the same table. Otherwise 004571 ** the changes made by this write cursor would be visible to 004572 ** the read cursors in the other database connection. 004573 ** 004574 ** 3: The database must be writable (not on read-only media) 004575 ** 004576 ** 4: There must be an active transaction. 004577 ** 004578 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR 004579 ** is set. If FORDELETE is set, that is a hint to the implementation that 004580 ** this cursor will only be used to seek to and delete entries of an index 004581 ** as part of a larger DELETE statement. The FORDELETE hint is not used by 004582 ** this implementation. But in a hypothetical alternative storage engine 004583 ** in which index entries are automatically deleted when corresponding table 004584 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE 004585 ** operations on this cursor can be no-ops and all READ operations can 004586 ** return a null row (2-bytes: 0x01 0x00). 004587 ** 004588 ** No checking is done to make sure that page iTable really is the 004589 ** root page of a b-tree. If it is not, then the cursor acquired 004590 ** will not work correctly. 004591 ** 004592 ** It is assumed that the sqlite3BtreeCursorZero() has been called 004593 ** on pCur to initialize the memory space prior to invoking this routine. 004594 */ 004595 static int btreeCursor( 004596 Btree *p, /* The btree */ 004597 Pgno iTable, /* Root page of table to open */ 004598 int wrFlag, /* 1 to write. 
0 read-only */ 004599 struct KeyInfo *pKeyInfo, /* First arg to comparison function */ 004600 BtCursor *pCur /* Space for new cursor */ 004601 ){ 004602 BtShared *pBt = p->pBt; /* Shared b-tree handle */ 004603 BtCursor *pX; /* Looping over other all cursors */ 004604 004605 assert( sqlite3BtreeHoldsMutex(p) ); 004606 assert( wrFlag==0 004607 || wrFlag==BTREE_WRCSR 004608 || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE) 004609 ); 004610 004611 /* The following assert statements verify that if this is a sharable 004612 ** b-tree database, the connection is holding the required table locks, 004613 ** and that no other connection has any open cursor that conflicts with 004614 ** this lock. The iTable<1 term disables the check for corrupt schemas. */ 004615 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) 004616 || iTable<1 ); 004617 assert( wrFlag==0 || !hasReadConflicts(p, iTable) ); 004618 004619 /* Assert that the caller has opened the required transaction. */ 004620 assert( p->inTrans>TRANS_NONE ); 004621 assert( wrFlag==0 || p->inTrans==TRANS_WRITE ); 004622 assert( pBt->pPage1 && pBt->pPage1->aData ); 004623 assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 ); 004624 004625 if( iTable<=1 ){ 004626 if( iTable<1 ){ 004627 return SQLITE_CORRUPT_BKPT; 004628 }else if( btreePagecount(pBt)==0 ){ 004629 assert( wrFlag==0 ); 004630 iTable = 0; 004631 } 004632 } 004633 004634 /* Now that no other errors can occur, finish filling in the BtCursor 004635 ** variables and link the cursor into the BtShared list. */ 004636 pCur->pgnoRoot = iTable; 004637 pCur->iPage = -1; 004638 pCur->pKeyInfo = pKeyInfo; 004639 pCur->pBtree = p; 004640 pCur->pBt = pBt; 004641 pCur->curFlags = 0; 004642 /* If there are two or more cursors on the same btree, then all such 004643 ** cursors *must* have the BTCF_Multiple flag set. 
*/ 004644 for(pX=pBt->pCursor; pX; pX=pX->pNext){ 004645 if( pX->pgnoRoot==iTable ){ 004646 pX->curFlags |= BTCF_Multiple; 004647 pCur->curFlags = BTCF_Multiple; 004648 } 004649 } 004650 pCur->eState = CURSOR_INVALID; 004651 pCur->pNext = pBt->pCursor; 004652 pBt->pCursor = pCur; 004653 if( wrFlag ){ 004654 pCur->curFlags |= BTCF_WriteFlag; 004655 pCur->curPagerFlags = 0; 004656 if( pBt->pTmpSpace==0 ) return allocateTempSpace(pBt); 004657 }else{ 004658 pCur->curPagerFlags = PAGER_GET_READONLY; 004659 } 004660 return SQLITE_OK; 004661 } 004662 static int btreeCursorWithLock( 004663 Btree *p, /* The btree */ 004664 Pgno iTable, /* Root page of table to open */ 004665 int wrFlag, /* 1 to write. 0 read-only */ 004666 struct KeyInfo *pKeyInfo, /* First arg to comparison function */ 004667 BtCursor *pCur /* Space for new cursor */ 004668 ){ 004669 int rc; 004670 sqlite3BtreeEnter(p); 004671 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); 004672 sqlite3BtreeLeave(p); 004673 return rc; 004674 } 004675 int sqlite3BtreeCursor( 004676 Btree *p, /* The btree */ 004677 Pgno iTable, /* Root page of table to open */ 004678 int wrFlag, /* 1 to write. 0 read-only */ 004679 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */ 004680 BtCursor *pCur /* Write new cursor here */ 004681 ){ 004682 if( p->sharable ){ 004683 return btreeCursorWithLock(p, iTable, wrFlag, pKeyInfo, pCur); 004684 }else{ 004685 return btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); 004686 } 004687 } 004688 004689 /* 004690 ** Return the size of a BtCursor object in bytes. 004691 ** 004692 ** This interfaces is needed so that users of cursors can preallocate 004693 ** sufficient storage to hold a cursor. The BtCursor object is opaque 004694 ** to users so they cannot do the sizeof() themselves - they must call 004695 ** this routine. 
004696 */ 004697 int sqlite3BtreeCursorSize(void){ 004698 return ROUND8(sizeof(BtCursor)); 004699 } 004700 004701 /* 004702 ** Initialize memory that will be converted into a BtCursor object. 004703 ** 004704 ** The simple approach here would be to memset() the entire object 004705 ** to zero. But it turns out that the apPage[] and aiIdx[] arrays 004706 ** do not need to be zeroed and they are large, so we can save a lot 004707 ** of run-time by skipping the initialization of those elements. 004708 */ 004709 void sqlite3BtreeCursorZero(BtCursor *p){ 004710 memset(p, 0, offsetof(BtCursor, BTCURSOR_FIRST_UNINIT)); 004711 } 004712 004713 /* 004714 ** Close a cursor. The read lock on the database file is released 004715 ** when the last cursor is closed. 004716 */ 004717 int sqlite3BtreeCloseCursor(BtCursor *pCur){ 004718 Btree *pBtree = pCur->pBtree; 004719 if( pBtree ){ 004720 BtShared *pBt = pCur->pBt; 004721 sqlite3BtreeEnter(pBtree); 004722 assert( pBt->pCursor!=0 ); 004723 if( pBt->pCursor==pCur ){ 004724 pBt->pCursor = pCur->pNext; 004725 }else{ 004726 BtCursor *pPrev = pBt->pCursor; 004727 do{ 004728 if( pPrev->pNext==pCur ){ 004729 pPrev->pNext = pCur->pNext; 004730 break; 004731 } 004732 pPrev = pPrev->pNext; 004733 }while( ALWAYS(pPrev) ); 004734 } 004735 btreeReleaseAllCursorPages(pCur); 004736 unlockBtreeIfUnused(pBt); 004737 sqlite3_free(pCur->aOverflow); 004738 sqlite3_free(pCur->pKey); 004739 if( (pBt->openFlags & BTREE_SINGLE) && pBt->pCursor==0 ){ 004740 /* Since the BtShared is not sharable, there is no need to 004741 ** worry about the missing sqlite3BtreeLeave() call here. */ 004742 assert( pBtree->sharable==0 ); 004743 sqlite3BtreeClose(pBtree); 004744 }else{ 004745 sqlite3BtreeLeave(pBtree); 004746 } 004747 pCur->pBtree = 0; 004748 } 004749 return SQLITE_OK; 004750 } 004751 004752 /* 004753 ** Make sure the BtCursor* given in the argument has a valid 004754 ** BtCursor.info structure. 
If it is not already valid, call 004755 ** btreeParseCell() to fill it in. 004756 ** 004757 ** BtCursor.info is a cache of the information in the current cell. 004758 ** Using this cache reduces the number of calls to btreeParseCell(). 004759 */ 004760 #ifndef NDEBUG 004761 static int cellInfoEqual(CellInfo *a, CellInfo *b){ 004762 if( a->nKey!=b->nKey ) return 0; 004763 if( a->pPayload!=b->pPayload ) return 0; 004764 if( a->nPayload!=b->nPayload ) return 0; 004765 if( a->nLocal!=b->nLocal ) return 0; 004766 if( a->nSize!=b->nSize ) return 0; 004767 return 1; 004768 } 004769 static void assertCellInfo(BtCursor *pCur){ 004770 CellInfo info; 004771 memset(&info, 0, sizeof(info)); 004772 btreeParseCell(pCur->pPage, pCur->ix, &info); 004773 assert( CORRUPT_DB || cellInfoEqual(&info, &pCur->info) ); 004774 } 004775 #else 004776 #define assertCellInfo(x) 004777 #endif 004778 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){ 004779 if( pCur->info.nSize==0 ){ 004780 pCur->curFlags |= BTCF_ValidNKey; 004781 btreeParseCell(pCur->pPage,pCur->ix,&pCur->info); 004782 }else{ 004783 assertCellInfo(pCur); 004784 } 004785 } 004786 004787 #ifndef NDEBUG /* The next routine used only within assert() statements */ 004788 /* 004789 ** Return true if the given BtCursor is valid. A valid cursor is one 004790 ** that is currently pointing to a row in a (non-empty) table. 004791 ** This is a verification routine is used only within assert() statements. 004792 */ 004793 int sqlite3BtreeCursorIsValid(BtCursor *pCur){ 004794 return pCur && pCur->eState==CURSOR_VALID; 004795 } 004796 #endif /* NDEBUG */ 004797 int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){ 004798 assert( pCur!=0 ); 004799 return pCur->eState==CURSOR_VALID; 004800 } 004801 004802 /* 004803 ** Return the value of the integer key or "rowid" for a table btree. 004804 ** This routine is only valid for a cursor that is pointing into a 004805 ** ordinary table btree. 
If the cursor points to an index btree or 004806 ** is invalid, the result of this routine is undefined. 004807 */ 004808 i64 sqlite3BtreeIntegerKey(BtCursor *pCur){ 004809 assert( cursorHoldsMutex(pCur) ); 004810 assert( pCur->eState==CURSOR_VALID ); 004811 assert( pCur->curIntKey ); 004812 getCellInfo(pCur); 004813 return pCur->info.nKey; 004814 } 004815 004816 /* 004817 ** Pin or unpin a cursor. 004818 */ 004819 void sqlite3BtreeCursorPin(BtCursor *pCur){ 004820 assert( (pCur->curFlags & BTCF_Pinned)==0 ); 004821 pCur->curFlags |= BTCF_Pinned; 004822 } 004823 void sqlite3BtreeCursorUnpin(BtCursor *pCur){ 004824 assert( (pCur->curFlags & BTCF_Pinned)!=0 ); 004825 pCur->curFlags &= ~BTCF_Pinned; 004826 } 004827 004828 /* 004829 ** Return the offset into the database file for the start of the 004830 ** payload to which the cursor is pointing. 004831 */ 004832 i64 sqlite3BtreeOffset(BtCursor *pCur){ 004833 assert( cursorHoldsMutex(pCur) ); 004834 assert( pCur->eState==CURSOR_VALID ); 004835 getCellInfo(pCur); 004836 return (i64)pCur->pBt->pageSize*((i64)pCur->pPage->pgno - 1) + 004837 (i64)(pCur->info.pPayload - pCur->pPage->aData); 004838 } 004839 004840 /* 004841 ** Return the number of bytes of payload for the entry that pCur is 004842 ** currently pointing to. For table btrees, this will be the amount 004843 ** of data. For index btrees, this will be the size of the key. 004844 ** 004845 ** The caller must guarantee that the cursor is pointing to a non-NULL 004846 ** valid entry. In other words, the calling procedure must guarantee 004847 ** that the cursor has Cursor.eState==CURSOR_VALID. 004848 */ 004849 u32 sqlite3BtreePayloadSize(BtCursor *pCur){ 004850 assert( cursorHoldsMutex(pCur) ); 004851 assert( pCur->eState==CURSOR_VALID ); 004852 getCellInfo(pCur); 004853 return pCur->info.nPayload; 004854 } 004855 004856 /* 004857 ** Return an upper bound on the size of any record for the table 004858 ** that the cursor is pointing into. 
004859 ** 004860 ** This is an optimization. Everything will still work if this 004861 ** routine always returns 2147483647 (which is the largest record 004862 ** that SQLite can handle) or more. But returning a smaller value might 004863 ** prevent large memory allocations when trying to interpret a 004864 ** corrupt database. 004865 ** 004866 ** The current implementation merely returns the size of the underlying 004867 ** database file. 004868 */ 004869 sqlite3_int64 sqlite3BtreeMaxRecordSize(BtCursor *pCur){ 004870 assert( cursorHoldsMutex(pCur) ); 004871 assert( pCur->eState==CURSOR_VALID ); 004872 return pCur->pBt->pageSize * (sqlite3_int64)pCur->pBt->nPage; 004873 } 004874 004875 /* 004876 ** Given the page number of an overflow page in the database (parameter 004877 ** ovfl), this function finds the page number of the next page in the 004878 ** linked list of overflow pages. If possible, it uses the auto-vacuum 004879 ** pointer-map data instead of reading the content of page ovfl to do so. 004880 ** 004881 ** If an error occurs an SQLite error code is returned. Otherwise: 004882 ** 004883 ** The page number of the next overflow page in the linked list is 004884 ** written to *pPgnoNext. If page ovfl is the last page in its linked 004885 ** list, *pPgnoNext is set to zero. 004886 ** 004887 ** If ppPage is not NULL, and a reference to the MemPage object corresponding 004888 ** to page number pOvfl was obtained, then *ppPage is set to point to that 004889 ** reference. It is the responsibility of the caller to call releasePage() 004890 ** on *ppPage to free the reference. In no reference was obtained (because 004891 ** the pointer-map was used to obtain the value for *pPgnoNext), then 004892 ** *ppPage is set to zero. 
004893 */ 004894 static int getOverflowPage( 004895 BtShared *pBt, /* The database file */ 004896 Pgno ovfl, /* Current overflow page number */ 004897 MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */ 004898 Pgno *pPgnoNext /* OUT: Next overflow page number */ 004899 ){ 004900 Pgno next = 0; 004901 MemPage *pPage = 0; 004902 int rc = SQLITE_OK; 004903 004904 assert( sqlite3_mutex_held(pBt->mutex) ); 004905 assert(pPgnoNext); 004906 004907 #ifndef SQLITE_OMIT_AUTOVACUUM 004908 /* Try to find the next page in the overflow list using the 004909 ** autovacuum pointer-map pages. Guess that the next page in 004910 ** the overflow list is page number (ovfl+1). If that guess turns 004911 ** out to be wrong, fall back to loading the data of page 004912 ** number ovfl to determine the next page number. 004913 */ 004914 if( pBt->autoVacuum ){ 004915 Pgno pgno; 004916 Pgno iGuess = ovfl+1; 004917 u8 eType; 004918 004919 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){ 004920 iGuess++; 004921 } 004922 004923 if( iGuess<=btreePagecount(pBt) ){ 004924 rc = ptrmapGet(pBt, iGuess, &eType, &pgno); 004925 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ 004926 next = iGuess; 004927 rc = SQLITE_DONE; 004928 } 004929 } 004930 } 004931 #endif 004932 004933 assert( next==0 || rc==SQLITE_DONE ); 004934 if( rc==SQLITE_OK ){ 004935 rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0); 004936 assert( rc==SQLITE_OK || pPage==0 ); 004937 if( rc==SQLITE_OK ){ 004938 next = get4byte(pPage->aData); 004939 } 004940 } 004941 004942 *pPgnoNext = next; 004943 if( ppPage ){ 004944 *ppPage = pPage; 004945 }else{ 004946 releasePage(pPage); 004947 } 004948 return (rc==SQLITE_DONE ? SQLITE_OK : rc); 004949 } 004950 004951 /* 004952 ** Copy data from a buffer to a page, or from a page to a buffer. 004953 ** 004954 ** pPayload is a pointer to data stored on database page pDbPage. 
004955 ** If argument eOp is false, then nByte bytes of data are copied 004956 ** from pPayload to the buffer pointed at by pBuf. If eOp is true, 004957 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes 004958 ** of data are copied from the buffer pBuf to pPayload. 004959 ** 004960 ** SQLITE_OK is returned on success, otherwise an error code. 004961 */ 004962 static int copyPayload( 004963 void *pPayload, /* Pointer to page data */ 004964 void *pBuf, /* Pointer to buffer */ 004965 int nByte, /* Number of bytes to copy */ 004966 int eOp, /* 0 -> copy from page, 1 -> copy to page */ 004967 DbPage *pDbPage /* Page containing pPayload */ 004968 ){ 004969 if( eOp ){ 004970 /* Copy data from buffer to page (a write operation) */ 004971 int rc = sqlite3PagerWrite(pDbPage); 004972 if( rc!=SQLITE_OK ){ 004973 return rc; 004974 } 004975 memcpy(pPayload, pBuf, nByte); 004976 }else{ 004977 /* Copy data from page to buffer (a read operation) */ 004978 memcpy(pBuf, pPayload, nByte); 004979 } 004980 return SQLITE_OK; 004981 } 004982 004983 /* 004984 ** This function is used to read or overwrite payload information 004985 ** for the entry that the pCur cursor is pointing to. The eOp 004986 ** argument is interpreted as follows: 004987 ** 004988 ** 0: The operation is a read. Populate the overflow cache. 004989 ** 1: The operation is a write. Populate the overflow cache. 004990 ** 004991 ** A total of "amt" bytes are read or written beginning at "offset". 004992 ** Data is read to or from the buffer pBuf. 004993 ** 004994 ** The content being read or written might appear on the main page 004995 ** or be scattered out on multiple overflow pages. 004996 ** 004997 ** If the current cursor entry uses one or more overflow pages 004998 ** this function may allocate space for and lazily populate 004999 ** the overflow page-list cache array (BtCursor.aOverflow). 005000 ** Subsequent calls use this cache to make seeking to the supplied offset 005001 ** more efficient. 
005002 ** 005003 ** Once an overflow page-list cache has been allocated, it must be 005004 ** invalidated if some other cursor writes to the same table, or if 005005 ** the cursor is moved to a different row. Additionally, in auto-vacuum 005006 ** mode, the following events may invalidate an overflow page-list cache. 005007 ** 005008 ** * An incremental vacuum, 005009 ** * A commit in auto_vacuum="full" mode, 005010 ** * Creating a table (may require moving an overflow page). 005011 */ 005012 static int accessPayload( 005013 BtCursor *pCur, /* Cursor pointing to entry to read from */ 005014 u32 offset, /* Begin reading this far into payload */ 005015 u32 amt, /* Read this many bytes */ 005016 unsigned char *pBuf, /* Write the bytes into this buffer */ 005017 int eOp /* zero to read. non-zero to write. */ 005018 ){ 005019 unsigned char *aPayload; 005020 int rc = SQLITE_OK; 005021 int iIdx = 0; 005022 MemPage *pPage = pCur->pPage; /* Btree page of current entry */ 005023 BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */ 005024 #ifdef SQLITE_DIRECT_OVERFLOW_READ 005025 unsigned char * const pBufStart = pBuf; /* Start of original out buffer */ 005026 #endif 005027 005028 assert( pPage ); 005029 assert( eOp==0 || eOp==1 ); 005030 assert( pCur->eState==CURSOR_VALID ); 005031 if( pCur->ix>=pPage->nCell ){ 005032 return SQLITE_CORRUPT_PAGE(pPage); 005033 } 005034 assert( cursorHoldsMutex(pCur) ); 005035 005036 getCellInfo(pCur); 005037 aPayload = pCur->info.pPayload; 005038 assert( offset+amt <= pCur->info.nPayload ); 005039 005040 assert( aPayload > pPage->aData ); 005041 if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){ 005042 /* Trying to read or write past the end of the data is an error. 
The 005043 ** conditional above is really: 005044 ** &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] 005045 ** but is recast into its current form to avoid integer overflow problems 005046 */ 005047 return SQLITE_CORRUPT_PAGE(pPage); 005048 } 005049 005050 /* Check if data must be read/written to/from the btree page itself. */ 005051 if( offset<pCur->info.nLocal ){ 005052 int a = amt; 005053 if( a+offset>pCur->info.nLocal ){ 005054 a = pCur->info.nLocal - offset; 005055 } 005056 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage); 005057 offset = 0; 005058 pBuf += a; 005059 amt -= a; 005060 }else{ 005061 offset -= pCur->info.nLocal; 005062 } 005063 005064 005065 if( rc==SQLITE_OK && amt>0 ){ 005066 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ 005067 Pgno nextPage; 005068 005069 nextPage = get4byte(&aPayload[pCur->info.nLocal]); 005070 005071 /* If the BtCursor.aOverflow[] has not been allocated, allocate it now. 005072 ** 005073 ** The aOverflow[] array is sized at one entry for each overflow page 005074 ** in the overflow chain. The page number of the first overflow page is 005075 ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array 005076 ** means "not yet known" (the cache is lazily populated). 
005077 */ 005078 if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){ 005079 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; 005080 if( pCur->aOverflow==0 005081 || nOvfl*(int)sizeof(Pgno) > sqlite3MallocSize(pCur->aOverflow) 005082 ){ 005083 Pgno *aNew = (Pgno*)sqlite3Realloc( 005084 pCur->aOverflow, nOvfl*2*sizeof(Pgno) 005085 ); 005086 if( aNew==0 ){ 005087 return SQLITE_NOMEM_BKPT; 005088 }else{ 005089 pCur->aOverflow = aNew; 005090 } 005091 } 005092 memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno)); 005093 pCur->curFlags |= BTCF_ValidOvfl; 005094 }else{ 005095 /* If the overflow page-list cache has been allocated and the 005096 ** entry for the first required overflow page is valid, skip 005097 ** directly to it. 005098 */ 005099 if( pCur->aOverflow[offset/ovflSize] ){ 005100 iIdx = (offset/ovflSize); 005101 nextPage = pCur->aOverflow[iIdx]; 005102 offset = (offset%ovflSize); 005103 } 005104 } 005105 005106 assert( rc==SQLITE_OK && amt>0 ); 005107 while( nextPage ){ 005108 /* If required, populate the overflow page-list cache. */ 005109 if( nextPage > pBt->nPage ) return SQLITE_CORRUPT_BKPT; 005110 assert( pCur->aOverflow[iIdx]==0 005111 || pCur->aOverflow[iIdx]==nextPage 005112 || CORRUPT_DB ); 005113 pCur->aOverflow[iIdx] = nextPage; 005114 005115 if( offset>=ovflSize ){ 005116 /* The only reason to read this page is to obtain the page 005117 ** number for the next page in the overflow chain. The page 005118 ** data is not required. So first try to lookup the overflow 005119 ** page-list cache, if any, then fall back to the getOverflowPage() 005120 ** function. 005121 */ 005122 assert( pCur->curFlags & BTCF_ValidOvfl ); 005123 assert( pCur->pBtree->db==pBt->db ); 005124 if( pCur->aOverflow[iIdx+1] ){ 005125 nextPage = pCur->aOverflow[iIdx+1]; 005126 }else{ 005127 rc = getOverflowPage(pBt, nextPage, 0, &nextPage); 005128 } 005129 offset -= ovflSize; 005130 }else{ 005131 /* Need to read this page properly. 
It contains some of the 005132 ** range of data that is being read (eOp==0) or written (eOp!=0). 005133 */ 005134 int a = amt; 005135 if( a + offset > ovflSize ){ 005136 a = ovflSize - offset; 005137 } 005138 005139 #ifdef SQLITE_DIRECT_OVERFLOW_READ 005140 /* If all the following are true: 005141 ** 005142 ** 1) this is a read operation, and 005143 ** 2) data is required from the start of this overflow page, and 005144 ** 3) there are no dirty pages in the page-cache 005145 ** 4) the database is file-backed, and 005146 ** 5) the page is not in the WAL file 005147 ** 6) at least 4 bytes have already been read into the output buffer 005148 ** 005149 ** then data can be read directly from the database file into the 005150 ** output buffer, bypassing the page-cache altogether. This speeds 005151 ** up loading large records that span many overflow pages. 005152 */ 005153 if( eOp==0 /* (1) */ 005154 && offset==0 /* (2) */ 005155 && sqlite3PagerDirectReadOk(pBt->pPager, nextPage) /* (3,4,5) */ 005156 && &pBuf[-4]>=pBufStart /* (6) */ 005157 ){ 005158 sqlite3_file *fd = sqlite3PagerFile(pBt->pPager); 005159 u8 aSave[4]; 005160 u8 *aWrite = &pBuf[-4]; 005161 assert( aWrite>=pBufStart ); /* due to (6) */ 005162 memcpy(aSave, aWrite, 4); 005163 rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1)); 005164 nextPage = get4byte(aWrite); 005165 memcpy(aWrite, aSave, 4); 005166 }else 005167 #endif 005168 005169 { 005170 DbPage *pDbPage; 005171 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage, 005172 (eOp==0 ? 
PAGER_GET_READONLY : 0) 005173 ); 005174 if( rc==SQLITE_OK ){ 005175 aPayload = sqlite3PagerGetData(pDbPage); 005176 nextPage = get4byte(aPayload); 005177 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); 005178 sqlite3PagerUnref(pDbPage); 005179 offset = 0; 005180 } 005181 } 005182 amt -= a; 005183 if( amt==0 ) return rc; 005184 pBuf += a; 005185 } 005186 if( rc ) break; 005187 iIdx++; 005188 } 005189 } 005190 005191 if( rc==SQLITE_OK && amt>0 ){ 005192 /* Overflow chain ends prematurely */ 005193 return SQLITE_CORRUPT_PAGE(pPage); 005194 } 005195 return rc; 005196 } 005197 005198 /* 005199 ** Read part of the payload for the row at which that cursor pCur is currently 005200 ** pointing. "amt" bytes will be transferred into pBuf[]. The transfer 005201 ** begins at "offset". 005202 ** 005203 ** pCur can be pointing to either a table or an index b-tree. 005204 ** If pointing to a table btree, then the content section is read. If 005205 ** pCur is pointing to an index b-tree then the key section is read. 005206 ** 005207 ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing 005208 ** to a valid row in the table. For sqlite3BtreePayloadChecked(), the 005209 ** cursor might be invalid or might need to be restored before being read. 005210 ** 005211 ** Return SQLITE_OK on success or an error code if anything goes 005212 ** wrong. An error is returned if "offset+amt" is larger than 005213 ** the available payload. 005214 */ 005215 int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 005216 assert( cursorHoldsMutex(pCur) ); 005217 assert( pCur->eState==CURSOR_VALID ); 005218 assert( pCur->iPage>=0 && pCur->pPage ); 005219 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0); 005220 } 005221 005222 /* 005223 ** This variant of sqlite3BtreePayload() works even if the cursor has not 005224 ** in the CURSOR_VALID state. It is only used by the sqlite3_blob_read() 005225 ** interface. 
005226 */ 005227 #ifndef SQLITE_OMIT_INCRBLOB 005228 static SQLITE_NOINLINE int accessPayloadChecked( 005229 BtCursor *pCur, 005230 u32 offset, 005231 u32 amt, 005232 void *pBuf 005233 ){ 005234 int rc; 005235 if ( pCur->eState==CURSOR_INVALID ){ 005236 return SQLITE_ABORT; 005237 } 005238 assert( cursorOwnsBtShared(pCur) ); 005239 rc = btreeRestoreCursorPosition(pCur); 005240 return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0); 005241 } 005242 int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 005243 if( pCur->eState==CURSOR_VALID ){ 005244 assert( cursorOwnsBtShared(pCur) ); 005245 return accessPayload(pCur, offset, amt, pBuf, 0); 005246 }else{ 005247 return accessPayloadChecked(pCur, offset, amt, pBuf); 005248 } 005249 } 005250 #endif /* SQLITE_OMIT_INCRBLOB */ 005251 005252 /* 005253 ** Return a pointer to payload information from the entry that the 005254 ** pCur cursor is pointing to. The pointer is to the beginning of 005255 ** the key if index btrees (pPage->intKey==0) and is the data for 005256 ** table btrees (pPage->intKey==1). The number of bytes of available 005257 ** key/data is written into *pAmt. If *pAmt==0, then the value 005258 ** returned will not be a valid pointer. 005259 ** 005260 ** This routine is an optimization. It is common for the entire key 005261 ** and data to fit on the local page and for there to be no overflow 005262 ** pages. When that is so, this routine can be used to access the 005263 ** key and data without making a copy. If the key and/or data spills 005264 ** onto overflow pages, then accessPayload() must be used to reassemble 005265 ** the key/data and copy it into a preallocated buffer. 005266 ** 005267 ** The pointer returned by this routine looks directly into the cached 005268 ** page of the database. The data might change or move the next time 005269 ** any btree routine is called. 
005270 */ 005271 static const void *fetchPayload( 005272 BtCursor *pCur, /* Cursor pointing to entry to read from */ 005273 u32 *pAmt /* Write the number of available bytes here */ 005274 ){ 005275 int amt; 005276 assert( pCur!=0 && pCur->iPage>=0 && pCur->pPage); 005277 assert( pCur->eState==CURSOR_VALID ); 005278 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005279 assert( cursorOwnsBtShared(pCur) ); 005280 assert( pCur->ix<pCur->pPage->nCell || CORRUPT_DB ); 005281 assert( pCur->info.nSize>0 ); 005282 assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB ); 005283 assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB); 005284 amt = pCur->info.nLocal; 005285 if( amt>(int)(pCur->pPage->aDataEnd - pCur->info.pPayload) ){ 005286 /* There is too little space on the page for the expected amount 005287 ** of local content. Database must be corrupt. */ 005288 assert( CORRUPT_DB ); 005289 amt = MAX(0, (int)(pCur->pPage->aDataEnd - pCur->info.pPayload)); 005290 } 005291 *pAmt = (u32)amt; 005292 return (void*)pCur->info.pPayload; 005293 } 005294 005295 005296 /* 005297 ** For the entry that cursor pCur is point to, return as 005298 ** many bytes of the key or data as are available on the local 005299 ** b-tree page. Write the number of available bytes into *pAmt. 005300 ** 005301 ** The pointer returned is ephemeral. The key/data may move 005302 ** or be destroyed on the next call to any Btree routine, 005303 ** including calls from other threads against the same cache. 005304 ** Hence, a mutex on the BtShared should be held prior to calling 005305 ** this routine. 005306 ** 005307 ** These routines is used to get quick access to key and data 005308 ** in the common case where no overflow pages are used. 005309 */ 005310 const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){ 005311 return fetchPayload(pCur, pAmt); 005312 } 005313 005314 005315 /* 005316 ** Move the cursor down to a new child page. 
The newPgno argument is the 005317 ** page number of the child page to move to. 005318 ** 005319 ** This function returns SQLITE_CORRUPT if the page-header flags field of 005320 ** the new child page does not match the flags field of the parent (i.e. 005321 ** if an intkey page appears to be the parent of a non-intkey page, or 005322 ** vice-versa). 005323 */ 005324 static int moveToChild(BtCursor *pCur, u32 newPgno){ 005325 int rc; 005326 assert( cursorOwnsBtShared(pCur) ); 005327 assert( pCur->eState==CURSOR_VALID ); 005328 assert( pCur->iPage<BTCURSOR_MAX_DEPTH ); 005329 assert( pCur->iPage>=0 ); 005330 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ 005331 return SQLITE_CORRUPT_BKPT; 005332 } 005333 pCur->info.nSize = 0; 005334 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 005335 pCur->aiIdx[pCur->iPage] = pCur->ix; 005336 pCur->apPage[pCur->iPage] = pCur->pPage; 005337 pCur->ix = 0; 005338 pCur->iPage++; 005339 rc = getAndInitPage(pCur->pBt, newPgno, &pCur->pPage, pCur->curPagerFlags); 005340 assert( pCur->pPage!=0 || rc!=SQLITE_OK ); 005341 if( rc==SQLITE_OK 005342 && (pCur->pPage->nCell<1 || pCur->pPage->intKey!=pCur->curIntKey) 005343 ){ 005344 releasePage(pCur->pPage); 005345 rc = SQLITE_CORRUPT_PGNO(newPgno); 005346 } 005347 if( rc ){ 005348 pCur->pPage = pCur->apPage[--pCur->iPage]; 005349 } 005350 return rc; 005351 } 005352 005353 #ifdef SQLITE_DEBUG 005354 /* 005355 ** Page pParent is an internal (non-leaf) tree page. This function 005356 ** asserts that page number iChild is the left-child if the iIdx'th 005357 ** cell in page pParent. Or, if iIdx is equal to the total number of 005358 ** cells in pParent, that page number iChild is the right-child of 005359 ** the page. 
005360 */ 005361 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){ 005362 if( CORRUPT_DB ) return; /* The conditions tested below might not be true 005363 ** in a corrupt database */ 005364 assert( iIdx<=pParent->nCell ); 005365 if( iIdx==pParent->nCell ){ 005366 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild ); 005367 }else{ 005368 assert( get4byte(findCell(pParent, iIdx))==iChild ); 005369 } 005370 } 005371 #else 005372 # define assertParentIndex(x,y,z) 005373 #endif 005374 005375 /* 005376 ** Move the cursor up to the parent page. 005377 ** 005378 ** pCur->idx is set to the cell index that contains the pointer 005379 ** to the page we are coming from. If we are coming from the 005380 ** right-most child page then pCur->idx is set to one more than 005381 ** the largest cell index. 005382 */ 005383 static void moveToParent(BtCursor *pCur){ 005384 MemPage *pLeaf; 005385 assert( cursorOwnsBtShared(pCur) ); 005386 assert( pCur->eState==CURSOR_VALID ); 005387 assert( pCur->iPage>0 ); 005388 assert( pCur->pPage ); 005389 assertParentIndex( 005390 pCur->apPage[pCur->iPage-1], 005391 pCur->aiIdx[pCur->iPage-1], 005392 pCur->pPage->pgno 005393 ); 005394 testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell ); 005395 pCur->info.nSize = 0; 005396 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 005397 pCur->ix = pCur->aiIdx[pCur->iPage-1]; 005398 pLeaf = pCur->pPage; 005399 pCur->pPage = pCur->apPage[--pCur->iPage]; 005400 releasePageNotNull(pLeaf); 005401 } 005402 005403 /* 005404 ** Move the cursor to point to the root page of its b-tree structure. 005405 ** 005406 ** If the table has a virtual root page, then the cursor is moved to point 005407 ** to the virtual root page instead of the actual root page. A table has a 005408 ** virtual root page when the actual root page contains no cells and a 005409 ** single child page. This can only happen with the table rooted at page 1. 
005410 ** 005411 ** If the b-tree structure is empty, the cursor state is set to 005412 ** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise, 005413 ** the cursor is set to point to the first cell located on the root 005414 ** (or virtual root) page and the cursor state is set to CURSOR_VALID. 005415 ** 005416 ** If this function returns successfully, it may be assumed that the 005417 ** page-header flags indicate that the [virtual] root-page is the expected 005418 ** kind of b-tree page (i.e. if when opening the cursor the caller did not 005419 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D, 005420 ** indicating a table b-tree, or if the caller did specify a KeyInfo 005421 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index 005422 ** b-tree). 005423 */ 005424 static int moveToRoot(BtCursor *pCur){ 005425 MemPage *pRoot; 005426 int rc = SQLITE_OK; 005427 005428 assert( cursorOwnsBtShared(pCur) ); 005429 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK ); 005430 assert( CURSOR_VALID < CURSOR_REQUIRESEEK ); 005431 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK ); 005432 assert( pCur->eState < CURSOR_REQUIRESEEK || pCur->iPage<0 ); 005433 assert( pCur->pgnoRoot>0 || pCur->iPage<0 ); 005434 005435 if( pCur->iPage>=0 ){ 005436 if( pCur->iPage ){ 005437 releasePageNotNull(pCur->pPage); 005438 while( --pCur->iPage ){ 005439 releasePageNotNull(pCur->apPage[pCur->iPage]); 005440 } 005441 pRoot = pCur->pPage = pCur->apPage[0]; 005442 goto skip_init; 005443 } 005444 }else if( pCur->pgnoRoot==0 ){ 005445 pCur->eState = CURSOR_INVALID; 005446 return SQLITE_EMPTY; 005447 }else{ 005448 assert( pCur->iPage==(-1) ); 005449 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 005450 if( pCur->eState==CURSOR_FAULT ){ 005451 assert( pCur->skipNext!=SQLITE_OK ); 005452 return pCur->skipNext; 005453 } 005454 sqlite3BtreeClearCursor(pCur); 005455 } 005456 rc = getAndInitPage(pCur->pBt, pCur->pgnoRoot, &pCur->pPage, 005457 pCur->curPagerFlags); 005458 if( 
rc!=SQLITE_OK ){ 005459 pCur->eState = CURSOR_INVALID; 005460 return rc; 005461 } 005462 pCur->iPage = 0; 005463 pCur->curIntKey = pCur->pPage->intKey; 005464 } 005465 pRoot = pCur->pPage; 005466 assert( pRoot->pgno==pCur->pgnoRoot || CORRUPT_DB ); 005467 005468 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor 005469 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is 005470 ** NULL, the caller expects a table b-tree. If this is not the case, 005471 ** return an SQLITE_CORRUPT error. 005472 ** 005473 ** Earlier versions of SQLite assumed that this test could not fail 005474 ** if the root page was already loaded when this function was called (i.e. 005475 ** if pCur->iPage>=0). But this is not so if the database is corrupted 005476 ** in such a way that page pRoot is linked into a second b-tree table 005477 ** (or the freelist). */ 005478 assert( pRoot->intKey==1 || pRoot->intKey==0 ); 005479 if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){ 005480 return SQLITE_CORRUPT_PAGE(pCur->pPage); 005481 } 005482 005483 skip_init: 005484 pCur->ix = 0; 005485 pCur->info.nSize = 0; 005486 pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl); 005487 005488 if( pRoot->nCell>0 ){ 005489 pCur->eState = CURSOR_VALID; 005490 }else if( !pRoot->leaf ){ 005491 Pgno subpage; 005492 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT; 005493 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); 005494 pCur->eState = CURSOR_VALID; 005495 rc = moveToChild(pCur, subpage); 005496 }else{ 005497 pCur->eState = CURSOR_INVALID; 005498 rc = SQLITE_EMPTY; 005499 } 005500 return rc; 005501 } 005502 005503 /* 005504 ** Move the cursor down to the left-most leaf entry beneath the 005505 ** entry to which it is currently pointing. 005506 ** 005507 ** The left-most leaf is the one with the smallest key - the first 005508 ** in ascending order. 
005509 */ 005510 static int moveToLeftmost(BtCursor *pCur){ 005511 Pgno pgno; 005512 int rc = SQLITE_OK; 005513 MemPage *pPage; 005514 005515 assert( cursorOwnsBtShared(pCur) ); 005516 assert( pCur->eState==CURSOR_VALID ); 005517 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){ 005518 assert( pCur->ix<pPage->nCell ); 005519 pgno = get4byte(findCell(pPage, pCur->ix)); 005520 rc = moveToChild(pCur, pgno); 005521 } 005522 return rc; 005523 } 005524 005525 /* 005526 ** Move the cursor down to the right-most leaf entry beneath the 005527 ** page to which it is currently pointing. Notice the difference 005528 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() 005529 ** finds the left-most entry beneath the *entry* whereas moveToRightmost() 005530 ** finds the right-most entry beneath the *page*. 005531 ** 005532 ** The right-most entry is the one with the largest key - the last 005533 ** key in ascending order. 005534 */ 005535 static int moveToRightmost(BtCursor *pCur){ 005536 Pgno pgno; 005537 int rc = SQLITE_OK; 005538 MemPage *pPage = 0; 005539 005540 assert( cursorOwnsBtShared(pCur) ); 005541 assert( pCur->eState==CURSOR_VALID ); 005542 while( !(pPage = pCur->pPage)->leaf ){ 005543 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 005544 pCur->ix = pPage->nCell; 005545 rc = moveToChild(pCur, pgno); 005546 if( rc ) return rc; 005547 } 005548 pCur->ix = pPage->nCell-1; 005549 assert( pCur->info.nSize==0 ); 005550 assert( (pCur->curFlags & BTCF_ValidNKey)==0 ); 005551 return SQLITE_OK; 005552 } 005553 005554 /* Move the cursor to the first entry in the table. Return SQLITE_OK 005555 ** on success. Set *pRes to 0 if the cursor actually points to something 005556 ** or set *pRes to 1 if the table is empty. 
005557 */ 005558 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ 005559 int rc; 005560 005561 assert( cursorOwnsBtShared(pCur) ); 005562 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005563 rc = moveToRoot(pCur); 005564 if( rc==SQLITE_OK ){ 005565 assert( pCur->pPage->nCell>0 ); 005566 *pRes = 0; 005567 rc = moveToLeftmost(pCur); 005568 }else if( rc==SQLITE_EMPTY ){ 005569 assert( pCur->pgnoRoot==0 || (pCur->pPage!=0 && pCur->pPage->nCell==0) ); 005570 *pRes = 1; 005571 rc = SQLITE_OK; 005572 } 005573 return rc; 005574 } 005575 005576 /* Move the cursor to the last entry in the table. Return SQLITE_OK 005577 ** on success. Set *pRes to 0 if the cursor actually points to something 005578 ** or set *pRes to 1 if the table is empty. 005579 */ 005580 static SQLITE_NOINLINE int btreeLast(BtCursor *pCur, int *pRes){ 005581 int rc = moveToRoot(pCur); 005582 if( rc==SQLITE_OK ){ 005583 assert( pCur->eState==CURSOR_VALID ); 005584 *pRes = 0; 005585 rc = moveToRightmost(pCur); 005586 if( rc==SQLITE_OK ){ 005587 pCur->curFlags |= BTCF_AtLast; 005588 }else{ 005589 pCur->curFlags &= ~BTCF_AtLast; 005590 } 005591 }else if( rc==SQLITE_EMPTY ){ 005592 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); 005593 *pRes = 1; 005594 rc = SQLITE_OK; 005595 } 005596 return rc; 005597 } 005598 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ 005599 assert( cursorOwnsBtShared(pCur) ); 005600 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005601 005602 /* If the cursor already points to the last entry, this is a no-op. */ 005603 if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){ 005604 #ifdef SQLITE_DEBUG 005605 /* This block serves to assert() that the cursor really does point 005606 ** to the last entry in the b-tree. 
*/ 005607 int ii; 005608 for(ii=0; ii<pCur->iPage; ii++){ 005609 assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell ); 005610 } 005611 assert( pCur->ix==pCur->pPage->nCell-1 || CORRUPT_DB ); 005612 testcase( pCur->ix!=pCur->pPage->nCell-1 ); 005613 /* ^-- dbsqlfuzz b92b72e4de80b5140c30ab71372ca719b8feb618 */ 005614 assert( pCur->pPage->leaf ); 005615 #endif 005616 *pRes = 0; 005617 return SQLITE_OK; 005618 } 005619 return btreeLast(pCur, pRes); 005620 } 005621 005622 /* Move the cursor so that it points to an entry in a table (a.k.a INTKEY) 005623 ** table near the key intKey. Return a success code. 005624 ** 005625 ** If an exact match is not found, then the cursor is always 005626 ** left pointing at a leaf page which would hold the entry if it 005627 ** were present. The cursor might point to an entry that comes 005628 ** before or after the key. 005629 ** 005630 ** An integer is written into *pRes which is the result of 005631 ** comparing the key with the entry to which the cursor is 005632 ** pointing. The meaning of the integer written into 005633 ** *pRes is as follows: 005634 ** 005635 ** *pRes<0 The cursor is left pointing at an entry that 005636 ** is smaller than intKey or if the table is empty 005637 ** and the cursor is therefore left point to nothing. 005638 ** 005639 ** *pRes==0 The cursor is left pointing at an entry that 005640 ** exactly matches intKey. 005641 ** 005642 ** *pRes>0 The cursor is left pointing at an entry that 005643 ** is larger than intKey. 
005644 */ 005645 int sqlite3BtreeTableMoveto( 005646 BtCursor *pCur, /* The cursor to be moved */ 005647 i64 intKey, /* The table key */ 005648 int biasRight, /* If true, bias the search to the high end */ 005649 int *pRes /* Write search results here */ 005650 ){ 005651 int rc; 005652 005653 assert( cursorOwnsBtShared(pCur) ); 005654 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005655 assert( pRes ); 005656 assert( pCur->pKeyInfo==0 ); 005657 assert( pCur->eState!=CURSOR_VALID || pCur->curIntKey!=0 ); 005658 005659 /* If the cursor is already positioned at the point we are trying 005660 ** to move to, then just return without doing any work */ 005661 if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0 ){ 005662 if( pCur->info.nKey==intKey ){ 005663 *pRes = 0; 005664 return SQLITE_OK; 005665 } 005666 if( pCur->info.nKey<intKey ){ 005667 if( (pCur->curFlags & BTCF_AtLast)!=0 ){ 005668 *pRes = -1; 005669 return SQLITE_OK; 005670 } 005671 /* If the requested key is one more than the previous key, then 005672 ** try to get there using sqlite3BtreeNext() rather than a full 005673 ** binary search. This is an optimization only. The correct answer 005674 ** is still obtained without this case, only a little more slowly. 
*/ 005675 if( pCur->info.nKey+1==intKey ){ 005676 *pRes = 0; 005677 rc = sqlite3BtreeNext(pCur, 0); 005678 if( rc==SQLITE_OK ){ 005679 getCellInfo(pCur); 005680 if( pCur->info.nKey==intKey ){ 005681 return SQLITE_OK; 005682 } 005683 }else if( rc!=SQLITE_DONE ){ 005684 return rc; 005685 } 005686 } 005687 } 005688 } 005689 005690 #ifdef SQLITE_DEBUG 005691 pCur->pBtree->nSeek++; /* Performance measurement during testing */ 005692 #endif 005693 005694 rc = moveToRoot(pCur); 005695 if( rc ){ 005696 if( rc==SQLITE_EMPTY ){ 005697 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); 005698 *pRes = -1; 005699 return SQLITE_OK; 005700 } 005701 return rc; 005702 } 005703 assert( pCur->pPage ); 005704 assert( pCur->pPage->isInit ); 005705 assert( pCur->eState==CURSOR_VALID ); 005706 assert( pCur->pPage->nCell > 0 ); 005707 assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey ); 005708 assert( pCur->curIntKey ); 005709 005710 for(;;){ 005711 int lwr, upr, idx, c; 005712 Pgno chldPg; 005713 MemPage *pPage = pCur->pPage; 005714 u8 *pCell; /* Pointer to current cell in pPage */ 005715 005716 /* pPage->nCell must be greater than zero. If this is the root-page 005717 ** the cursor would have been INVALID above and this for(;;) loop 005718 ** not run. If this is not the root-page, then the moveToChild() routine 005719 ** would have already detected db corruption. Similarly, pPage must 005720 ** be the right kind (index or table) of b-tree page. Otherwise 005721 ** a moveToChild() or moveToRoot() call would have detected corruption. */ 005722 assert( pPage->nCell>0 ); 005723 assert( pPage->intKey ); 005724 lwr = 0; 005725 upr = pPage->nCell-1; 005726 assert( biasRight==0 || biasRight==1 ); 005727 idx = upr>>(1-biasRight); /* idx = biasRight ? 
upr : (lwr+upr)/2; */ 005728 for(;;){ 005729 i64 nCellKey; 005730 pCell = findCellPastPtr(pPage, idx); 005731 if( pPage->intKeyLeaf ){ 005732 while( 0x80 <= *(pCell++) ){ 005733 if( pCell>=pPage->aDataEnd ){ 005734 return SQLITE_CORRUPT_PAGE(pPage); 005735 } 005736 } 005737 } 005738 getVarint(pCell, (u64*)&nCellKey); 005739 if( nCellKey<intKey ){ 005740 lwr = idx+1; 005741 if( lwr>upr ){ c = -1; break; } 005742 }else if( nCellKey>intKey ){ 005743 upr = idx-1; 005744 if( lwr>upr ){ c = +1; break; } 005745 }else{ 005746 assert( nCellKey==intKey ); 005747 pCur->ix = (u16)idx; 005748 if( !pPage->leaf ){ 005749 lwr = idx; 005750 goto moveto_table_next_layer; 005751 }else{ 005752 pCur->curFlags |= BTCF_ValidNKey; 005753 pCur->info.nKey = nCellKey; 005754 pCur->info.nSize = 0; 005755 *pRes = 0; 005756 return SQLITE_OK; 005757 } 005758 } 005759 assert( lwr+upr>=0 ); 005760 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2; */ 005761 } 005762 assert( lwr==upr+1 || !pPage->leaf ); 005763 assert( pPage->isInit ); 005764 if( pPage->leaf ){ 005765 assert( pCur->ix<pCur->pPage->nCell ); 005766 pCur->ix = (u16)idx; 005767 *pRes = c; 005768 rc = SQLITE_OK; 005769 goto moveto_table_finish; 005770 } 005771 moveto_table_next_layer: 005772 if( lwr>=pPage->nCell ){ 005773 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 005774 }else{ 005775 chldPg = get4byte(findCell(pPage, lwr)); 005776 } 005777 pCur->ix = (u16)lwr; 005778 rc = moveToChild(pCur, chldPg); 005779 if( rc ) break; 005780 } 005781 moveto_table_finish: 005782 pCur->info.nSize = 0; 005783 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); 005784 return rc; 005785 } 005786 005787 /* 005788 ** Compare the "idx"-th cell on the page the cursor pCur is currently 005789 ** pointing to to pIdxKey using xRecordCompare. Return negative or 005790 ** zero if the cell is less than or equal pIdxKey. Return positive 005791 ** if unknown. 
005792 ** 005793 ** Return value negative: Cell at pCur[idx] less than pIdxKey 005794 ** 005795 ** Return value is zero: Cell at pCur[idx] equals pIdxKey 005796 ** 005797 ** Return value positive: Nothing is known about the relationship 005798 ** of the cell at pCur[idx] and pIdxKey. 005799 ** 005800 ** This routine is part of an optimization. It is always safe to return 005801 ** a positive value as that will cause the optimization to be skipped. 005802 */ 005803 static int indexCellCompare( 005804 BtCursor *pCur, 005805 int idx, 005806 UnpackedRecord *pIdxKey, 005807 RecordCompare xRecordCompare 005808 ){ 005809 MemPage *pPage = pCur->pPage; 005810 int c; 005811 int nCell; /* Size of the pCell cell in bytes */ 005812 u8 *pCell = findCellPastPtr(pPage, idx); 005813 005814 nCell = pCell[0]; 005815 if( nCell<=pPage->max1bytePayload ){ 005816 /* This branch runs if the record-size field of the cell is a 005817 ** single byte varint and the record fits entirely on the main 005818 ** b-tree page. */ 005819 testcase( pCell+nCell+1==pPage->aDataEnd ); 005820 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey); 005821 }else if( !(pCell[1] & 0x80) 005822 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal 005823 ){ 005824 /* The record-size field is a 2 byte varint and the record 005825 ** fits entirely on the main b-tree page. */ 005826 testcase( pCell+nCell+2==pPage->aDataEnd ); 005827 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey); 005828 }else{ 005829 /* If the record extends into overflow pages, do not attempt 005830 ** the optimization. */ 005831 c = 99; 005832 } 005833 return c; 005834 } 005835 005836 /* 005837 ** Return true (non-zero) if pCur is current pointing to the last 005838 ** page of a table. 
005839 */ 005840 static int cursorOnLastPage(BtCursor *pCur){ 005841 int i; 005842 assert( pCur->eState==CURSOR_VALID ); 005843 for(i=0; i<pCur->iPage; i++){ 005844 MemPage *pPage = pCur->apPage[i]; 005845 if( pCur->aiIdx[i]<pPage->nCell ) return 0; 005846 } 005847 return 1; 005848 } 005849 005850 /* Move the cursor so that it points to an entry in an index table 005851 ** near the key pIdxKey. Return a success code. 005852 ** 005853 ** If an exact match is not found, then the cursor is always 005854 ** left pointing at a leaf page which would hold the entry if it 005855 ** were present. The cursor might point to an entry that comes 005856 ** before or after the key. 005857 ** 005858 ** An integer is written into *pRes which is the result of 005859 ** comparing the key with the entry to which the cursor is 005860 ** pointing. The meaning of the integer written into 005861 ** *pRes is as follows: 005862 ** 005863 ** *pRes<0 The cursor is left pointing at an entry that 005864 ** is smaller than pIdxKey or if the table is empty 005865 ** and the cursor is therefore left point to nothing. 005866 ** 005867 ** *pRes==0 The cursor is left pointing at an entry that 005868 ** exactly matches pIdxKey. 005869 ** 005870 ** *pRes>0 The cursor is left pointing at an entry that 005871 ** is larger than pIdxKey. 005872 ** 005873 ** The pIdxKey->eqSeen field is set to 1 if there 005874 ** exists an entry in the table that exactly matches pIdxKey. 
005875 */ 005876 int sqlite3BtreeIndexMoveto( 005877 BtCursor *pCur, /* The cursor to be moved */ 005878 UnpackedRecord *pIdxKey, /* Unpacked index key */ 005879 int *pRes /* Write search results here */ 005880 ){ 005881 int rc; 005882 RecordCompare xRecordCompare; 005883 005884 assert( cursorOwnsBtShared(pCur) ); 005885 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005886 assert( pRes ); 005887 assert( pCur->pKeyInfo!=0 ); 005888 005889 #ifdef SQLITE_DEBUG 005890 pCur->pBtree->nSeek++; /* Performance measurement during testing */ 005891 #endif 005892 005893 xRecordCompare = sqlite3VdbeFindCompare(pIdxKey); 005894 pIdxKey->errCode = 0; 005895 assert( pIdxKey->default_rc==1 005896 || pIdxKey->default_rc==0 005897 || pIdxKey->default_rc==-1 005898 ); 005899 005900 005901 /* Check to see if we can skip a lot of work. Two cases: 005902 ** 005903 ** (1) If the cursor is already pointing to the very last cell 005904 ** in the table and the pIdxKey search key is greater than or 005905 ** equal to that last cell, then no movement is required. 005906 ** 005907 ** (2) If the cursor is on the last page of the table and the first 005908 ** cell on that last page is less than or equal to the pIdxKey 005909 ** search key, then we can start the search on the current page 005910 ** without needing to go back to root. 
005911 */ 005912 if( pCur->eState==CURSOR_VALID 005913 && pCur->pPage->leaf 005914 && cursorOnLastPage(pCur) 005915 ){ 005916 int c; 005917 if( pCur->ix==pCur->pPage->nCell-1 005918 && (c = indexCellCompare(pCur, pCur->ix, pIdxKey, xRecordCompare))<=0 005919 && pIdxKey->errCode==SQLITE_OK 005920 ){ 005921 *pRes = c; 005922 return SQLITE_OK; /* Cursor already pointing at the correct spot */ 005923 } 005924 if( pCur->iPage>0 005925 && indexCellCompare(pCur, 0, pIdxKey, xRecordCompare)<=0 005926 && pIdxKey->errCode==SQLITE_OK 005927 ){ 005928 pCur->curFlags &= ~BTCF_ValidOvfl; 005929 if( !pCur->pPage->isInit ){ 005930 return SQLITE_CORRUPT_BKPT; 005931 } 005932 goto bypass_moveto_root; /* Start search on the current page */ 005933 } 005934 pIdxKey->errCode = SQLITE_OK; 005935 } 005936 005937 rc = moveToRoot(pCur); 005938 if( rc ){ 005939 if( rc==SQLITE_EMPTY ){ 005940 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); 005941 *pRes = -1; 005942 return SQLITE_OK; 005943 } 005944 return rc; 005945 } 005946 005947 bypass_moveto_root: 005948 assert( pCur->pPage ); 005949 assert( pCur->pPage->isInit ); 005950 assert( pCur->eState==CURSOR_VALID ); 005951 assert( pCur->pPage->nCell > 0 ); 005952 assert( pCur->curIntKey==0 ); 005953 assert( pIdxKey!=0 ); 005954 for(;;){ 005955 int lwr, upr, idx, c; 005956 Pgno chldPg; 005957 MemPage *pPage = pCur->pPage; 005958 u8 *pCell; /* Pointer to current cell in pPage */ 005959 005960 /* pPage->nCell must be greater than zero. If this is the root-page 005961 ** the cursor would have been INVALID above and this for(;;) loop 005962 ** not run. If this is not the root-page, then the moveToChild() routine 005963 ** would have already detected db corruption. Similarly, pPage must 005964 ** be the right kind (index or table) of b-tree page. Otherwise 005965 ** a moveToChild() or moveToRoot() call would have detected corruption. 
*/ 005966 assert( pPage->nCell>0 ); 005967 assert( pPage->intKey==0 ); 005968 lwr = 0; 005969 upr = pPage->nCell-1; 005970 idx = upr>>1; /* idx = (lwr+upr)/2; */ 005971 for(;;){ 005972 int nCell; /* Size of the pCell cell in bytes */ 005973 pCell = findCellPastPtr(pPage, idx); 005974 005975 /* The maximum supported page-size is 65536 bytes. This means that 005976 ** the maximum number of record bytes stored on an index B-Tree 005977 ** page is less than 16384 bytes and may be stored as a 2-byte 005978 ** varint. This information is used to attempt to avoid parsing 005979 ** the entire cell by checking for the cases where the record is 005980 ** stored entirely within the b-tree page by inspecting the first 005981 ** 2 bytes of the cell. 005982 */ 005983 nCell = pCell[0]; 005984 if( nCell<=pPage->max1bytePayload ){ 005985 /* This branch runs if the record-size field of the cell is a 005986 ** single byte varint and the record fits entirely on the main 005987 ** b-tree page. */ 005988 testcase( pCell+nCell+1==pPage->aDataEnd ); 005989 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey); 005990 }else if( !(pCell[1] & 0x80) 005991 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal 005992 ){ 005993 /* The record-size field is a 2 byte varint and the record 005994 ** fits entirely on the main b-tree page. */ 005995 testcase( pCell+nCell+2==pPage->aDataEnd ); 005996 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey); 005997 }else{ 005998 /* The record flows over onto one or more overflow pages. In 005999 ** this case the whole cell needs to be parsed, a buffer allocated 006000 ** and accessPayload() used to retrieve the record into the 006001 ** buffer before VdbeRecordCompare() can be called. 006002 ** 006003 ** If the record is corrupt, the xRecordCompare routine may read 006004 ** up to two varints past the end of the buffer. An extra 18 006005 ** bytes of padding is allocated at the end of the buffer in 006006 ** case this happens. 
*/ 006007 void *pCellKey; 006008 u8 * const pCellBody = pCell - pPage->childPtrSize; 006009 const int nOverrun = 18; /* Size of the overrun padding */ 006010 pPage->xParseCell(pPage, pCellBody, &pCur->info); 006011 nCell = (int)pCur->info.nKey; 006012 testcase( nCell<0 ); /* True if key size is 2^32 or more */ 006013 testcase( nCell==0 ); /* Invalid key size: 0x80 0x80 0x00 */ 006014 testcase( nCell==1 ); /* Invalid key size: 0x80 0x80 0x01 */ 006015 testcase( nCell==2 ); /* Minimum legal index key size */ 006016 if( nCell<2 || nCell/pCur->pBt->usableSize>pCur->pBt->nPage ){ 006017 rc = SQLITE_CORRUPT_PAGE(pPage); 006018 goto moveto_index_finish; 006019 } 006020 pCellKey = sqlite3Malloc( nCell+nOverrun ); 006021 if( pCellKey==0 ){ 006022 rc = SQLITE_NOMEM_BKPT; 006023 goto moveto_index_finish; 006024 } 006025 pCur->ix = (u16)idx; 006026 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0); 006027 memset(((u8*)pCellKey)+nCell,0,nOverrun); /* Fix uninit warnings */ 006028 pCur->curFlags &= ~BTCF_ValidOvfl; 006029 if( rc ){ 006030 sqlite3_free(pCellKey); 006031 goto moveto_index_finish; 006032 } 006033 c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey); 006034 sqlite3_free(pCellKey); 006035 } 006036 assert( 006037 (pIdxKey->errCode!=SQLITE_CORRUPT || c==0) 006038 && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed) 006039 ); 006040 if( c<0 ){ 006041 lwr = idx+1; 006042 }else if( c>0 ){ 006043 upr = idx-1; 006044 }else{ 006045 assert( c==0 ); 006046 *pRes = 0; 006047 rc = SQLITE_OK; 006048 pCur->ix = (u16)idx; 006049 if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT; 006050 goto moveto_index_finish; 006051 } 006052 if( lwr>upr ) break; 006053 assert( lwr+upr>=0 ); 006054 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2 */ 006055 } 006056 assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) ); 006057 assert( pPage->isInit ); 006058 if( pPage->leaf ){ 006059 assert( pCur->ix<pCur->pPage->nCell || CORRUPT_DB ); 006060 pCur->ix = (u16)idx; 006061 
*pRes = c; 006062 rc = SQLITE_OK; 006063 goto moveto_index_finish; 006064 } 006065 if( lwr>=pPage->nCell ){ 006066 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 006067 }else{ 006068 chldPg = get4byte(findCell(pPage, lwr)); 006069 } 006070 006071 /* This block is similar to an in-lined version of: 006072 ** 006073 ** pCur->ix = (u16)lwr; 006074 ** rc = moveToChild(pCur, chldPg); 006075 ** if( rc ) break; 006076 */ 006077 pCur->info.nSize = 0; 006078 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 006079 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ 006080 return SQLITE_CORRUPT_BKPT; 006081 } 006082 pCur->aiIdx[pCur->iPage] = (u16)lwr; 006083 pCur->apPage[pCur->iPage] = pCur->pPage; 006084 pCur->ix = 0; 006085 pCur->iPage++; 006086 rc = getAndInitPage(pCur->pBt, chldPg, &pCur->pPage, pCur->curPagerFlags); 006087 if( rc==SQLITE_OK 006088 && (pCur->pPage->nCell<1 || pCur->pPage->intKey!=pCur->curIntKey) 006089 ){ 006090 releasePage(pCur->pPage); 006091 rc = SQLITE_CORRUPT_PGNO(chldPg); 006092 } 006093 if( rc ){ 006094 pCur->pPage = pCur->apPage[--pCur->iPage]; 006095 break; 006096 } 006097 /* 006098 ***** End of in-lined moveToChild() call */ 006099 } 006100 moveto_index_finish: 006101 pCur->info.nSize = 0; 006102 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); 006103 return rc; 006104 } 006105 006106 006107 /* 006108 ** Return TRUE if the cursor is not pointing at an entry of the table. 006109 ** 006110 ** TRUE will be returned after a call to sqlite3BtreeNext() moves 006111 ** past the last entry in the table or sqlite3BtreePrev() moves past 006112 ** the first entry. TRUE is also returned if the table is empty. 006113 */ 006114 int sqlite3BtreeEof(BtCursor *pCur){ 006115 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries 006116 ** have been deleted? This API will need to change to return an error code 006117 ** as well as the boolean result value. 
006118 */ 006119 return (CURSOR_VALID!=pCur->eState); 006120 } 006121 006122 /* 006123 ** Return an estimate for the number of rows in the table that pCur is 006124 ** pointing to. Return a negative number if no estimate is currently 006125 ** available. 006126 */ 006127 i64 sqlite3BtreeRowCountEst(BtCursor *pCur){ 006128 i64 n; 006129 u8 i; 006130 006131 assert( cursorOwnsBtShared(pCur) ); 006132 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 006133 006134 /* Currently this interface is only called by the OP_IfSmaller 006135 ** opcode, and it that case the cursor will always be valid and 006136 ** will always point to a leaf node. */ 006137 if( NEVER(pCur->eState!=CURSOR_VALID) ) return -1; 006138 if( NEVER(pCur->pPage->leaf==0) ) return -1; 006139 006140 n = pCur->pPage->nCell; 006141 for(i=0; i<pCur->iPage; i++){ 006142 n *= pCur->apPage[i]->nCell; 006143 } 006144 return n; 006145 } 006146 006147 /* 006148 ** Advance the cursor to the next entry in the database. 006149 ** Return value: 006150 ** 006151 ** SQLITE_OK success 006152 ** SQLITE_DONE cursor is already pointing at the last element 006153 ** otherwise some kind of error occurred 006154 ** 006155 ** The main entry point is sqlite3BtreeNext(). That routine is optimized 006156 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx 006157 ** to the next cell on the current page. The (slower) btreeNext() helper 006158 ** routine is called when it is necessary to move to a different page or 006159 ** to restore the cursor. 006160 ** 006161 ** If bit 0x01 of the F argument in sqlite3BtreeNext(C,F) is 1, then the 006162 ** cursor corresponds to an SQL index and this routine could have been 006163 ** skipped if the SQL index had been a unique index. The F argument 006164 ** is a hint to the implement. SQLite btree implementation does not use 006165 ** this hint, but COMDB2 does. 
006166 */ 006167 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){ 006168 int rc; 006169 int idx; 006170 MemPage *pPage; 006171 006172 assert( cursorOwnsBtShared(pCur) ); 006173 if( pCur->eState!=CURSOR_VALID ){ 006174 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); 006175 rc = restoreCursorPosition(pCur); 006176 if( rc!=SQLITE_OK ){ 006177 return rc; 006178 } 006179 if( CURSOR_INVALID==pCur->eState ){ 006180 return SQLITE_DONE; 006181 } 006182 if( pCur->eState==CURSOR_SKIPNEXT ){ 006183 pCur->eState = CURSOR_VALID; 006184 if( pCur->skipNext>0 ) return SQLITE_OK; 006185 } 006186 } 006187 006188 pPage = pCur->pPage; 006189 idx = ++pCur->ix; 006190 if( sqlite3FaultSim(412) ) pPage->isInit = 0; 006191 if( !pPage->isInit ){ 006192 return SQLITE_CORRUPT_BKPT; 006193 } 006194 006195 if( idx>=pPage->nCell ){ 006196 if( !pPage->leaf ){ 006197 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 006198 if( rc ) return rc; 006199 return moveToLeftmost(pCur); 006200 } 006201 do{ 006202 if( pCur->iPage==0 ){ 006203 pCur->eState = CURSOR_INVALID; 006204 return SQLITE_DONE; 006205 } 006206 moveToParent(pCur); 006207 pPage = pCur->pPage; 006208 }while( pCur->ix>=pPage->nCell ); 006209 if( pPage->intKey ){ 006210 return sqlite3BtreeNext(pCur, 0); 006211 }else{ 006212 return SQLITE_OK; 006213 } 006214 } 006215 if( pPage->leaf ){ 006216 return SQLITE_OK; 006217 }else{ 006218 return moveToLeftmost(pCur); 006219 } 006220 } 006221 int sqlite3BtreeNext(BtCursor *pCur, int flags){ 006222 MemPage *pPage; 006223 UNUSED_PARAMETER( flags ); /* Used in COMDB2 but not native SQLite */ 006224 assert( cursorOwnsBtShared(pCur) ); 006225 assert( flags==0 || flags==1 ); 006226 pCur->info.nSize = 0; 006227 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 006228 if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur); 006229 pPage = pCur->pPage; 006230 if( (++pCur->ix)>=pPage->nCell ){ 006231 pCur->ix--; 006232 return btreeNext(pCur); 006233 } 006234 if( pPage->leaf ){ 006235 
return SQLITE_OK; 006236 }else{ 006237 return moveToLeftmost(pCur); 006238 } 006239 } 006240 006241 /* 006242 ** Step the cursor to the back to the previous entry in the database. 006243 ** Return values: 006244 ** 006245 ** SQLITE_OK success 006246 ** SQLITE_DONE the cursor is already on the first element of the table 006247 ** otherwise some kind of error occurred 006248 ** 006249 ** The main entry point is sqlite3BtreePrevious(). That routine is optimized 006250 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx 006251 ** to the previous cell on the current page. The (slower) btreePrevious() 006252 ** helper routine is called when it is necessary to move to a different page 006253 ** or to restore the cursor. 006254 ** 006255 ** If bit 0x01 of the F argument to sqlite3BtreePrevious(C,F) is 1, then 006256 ** the cursor corresponds to an SQL index and this routine could have been 006257 ** skipped if the SQL index had been a unique index. The F argument is a 006258 ** hint to the implement. The native SQLite btree implementation does not 006259 ** use this hint, but COMDB2 does. 
006260 */ 006261 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur){ 006262 int rc; 006263 MemPage *pPage; 006264 006265 assert( cursorOwnsBtShared(pCur) ); 006266 assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 ); 006267 assert( pCur->info.nSize==0 ); 006268 if( pCur->eState!=CURSOR_VALID ){ 006269 rc = restoreCursorPosition(pCur); 006270 if( rc!=SQLITE_OK ){ 006271 return rc; 006272 } 006273 if( CURSOR_INVALID==pCur->eState ){ 006274 return SQLITE_DONE; 006275 } 006276 if( CURSOR_SKIPNEXT==pCur->eState ){ 006277 pCur->eState = CURSOR_VALID; 006278 if( pCur->skipNext<0 ) return SQLITE_OK; 006279 } 006280 } 006281 006282 pPage = pCur->pPage; 006283 if( sqlite3FaultSim(412) ) pPage->isInit = 0; 006284 if( !pPage->isInit ){ 006285 return SQLITE_CORRUPT_BKPT; 006286 } 006287 if( !pPage->leaf ){ 006288 int idx = pCur->ix; 006289 rc = moveToChild(pCur, get4byte(findCell(pPage, idx))); 006290 if( rc ) return rc; 006291 rc = moveToRightmost(pCur); 006292 }else{ 006293 while( pCur->ix==0 ){ 006294 if( pCur->iPage==0 ){ 006295 pCur->eState = CURSOR_INVALID; 006296 return SQLITE_DONE; 006297 } 006298 moveToParent(pCur); 006299 } 006300 assert( pCur->info.nSize==0 ); 006301 assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 ); 006302 006303 pCur->ix--; 006304 pPage = pCur->pPage; 006305 if( pPage->intKey && !pPage->leaf ){ 006306 rc = sqlite3BtreePrevious(pCur, 0); 006307 }else{ 006308 rc = SQLITE_OK; 006309 } 006310 } 006311 return rc; 006312 } 006313 int sqlite3BtreePrevious(BtCursor *pCur, int flags){ 006314 assert( cursorOwnsBtShared(pCur) ); 006315 assert( flags==0 || flags==1 ); 006316 UNUSED_PARAMETER( flags ); /* Used in COMDB2 but not native SQLite */ 006317 pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey); 006318 pCur->info.nSize = 0; 006319 if( pCur->eState!=CURSOR_VALID 006320 || pCur->ix==0 006321 || pCur->pPage->leaf==0 006322 ){ 006323 return btreePrevious(pCur); 006324 } 006325 pCur->ix--; 006326 return SQLITE_OK; 
006327 } 006328 006329 /* 006330 ** Allocate a new page from the database file. 006331 ** 006332 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite() 006333 ** has already been called on the new page.) The new page has also 006334 ** been referenced and the calling routine is responsible for calling 006335 ** sqlite3PagerUnref() on the new page when it is done. 006336 ** 006337 ** SQLITE_OK is returned on success. Any other return value indicates 006338 ** an error. *ppPage is set to NULL in the event of an error. 006339 ** 006340 ** If the "nearby" parameter is not 0, then an effort is made to 006341 ** locate a page close to the page number "nearby". This can be used in an 006342 ** attempt to keep related pages close to each other in the database file, 006343 ** which in turn can make database access faster. 006344 ** 006345 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists 006346 ** anywhere on the free-list, then it is guaranteed to be returned. If 006347 ** eMode is BTALLOC_LT then the page returned will be less than or equal 006348 ** to nearby if any such page exists. If eMode is BTALLOC_ANY then there 006349 ** are no restrictions on which page is returned. 
006350 */ 006351 static int allocateBtreePage( 006352 BtShared *pBt, /* The btree */ 006353 MemPage **ppPage, /* Store pointer to the allocated page here */ 006354 Pgno *pPgno, /* Store the page number here */ 006355 Pgno nearby, /* Search for a page near this one */ 006356 u8 eMode /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */ 006357 ){ 006358 MemPage *pPage1; 006359 int rc; 006360 u32 n; /* Number of pages on the freelist */ 006361 u32 k; /* Number of leaves on the trunk of the freelist */ 006362 MemPage *pTrunk = 0; 006363 MemPage *pPrevTrunk = 0; 006364 Pgno mxPage; /* Total size of the database file */ 006365 006366 assert( sqlite3_mutex_held(pBt->mutex) ); 006367 assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) ); 006368 pPage1 = pBt->pPage1; 006369 mxPage = btreePagecount(pBt); 006370 /* EVIDENCE-OF: R-21003-45125 The 4-byte big-endian integer at offset 36 006371 ** stores the total number of pages on the freelist. */ 006372 n = get4byte(&pPage1->aData[36]); 006373 testcase( n==mxPage-1 ); 006374 if( n>=mxPage ){ 006375 return SQLITE_CORRUPT_BKPT; 006376 } 006377 if( n>0 ){ 006378 /* There are pages on the freelist. Reuse one of those pages. */ 006379 Pgno iTrunk; 006380 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ 006381 u32 nSearch = 0; /* Count of the number of search attempts */ 006382 006383 /* If eMode==BTALLOC_EXACT and a query of the pointer-map 006384 ** shows that the page 'nearby' is somewhere on the free-list, then 006385 ** the entire-list will be searched for that page. 
006386 */ 006387 #ifndef SQLITE_OMIT_AUTOVACUUM 006388 if( eMode==BTALLOC_EXACT ){ 006389 if( nearby<=mxPage ){ 006390 u8 eType; 006391 assert( nearby>0 ); 006392 assert( pBt->autoVacuum ); 006393 rc = ptrmapGet(pBt, nearby, &eType, 0); 006394 if( rc ) return rc; 006395 if( eType==PTRMAP_FREEPAGE ){ 006396 searchList = 1; 006397 } 006398 } 006399 }else if( eMode==BTALLOC_LE ){ 006400 searchList = 1; 006401 } 006402 #endif 006403 006404 /* Decrement the free-list count by 1. Set iTrunk to the index of the 006405 ** first free-list trunk page. iPrevTrunk is initially 1. 006406 */ 006407 rc = sqlite3PagerWrite(pPage1->pDbPage); 006408 if( rc ) return rc; 006409 put4byte(&pPage1->aData[36], n-1); 006410 006411 /* The code within this loop is run only once if the 'searchList' variable 006412 ** is not true. Otherwise, it runs once for each trunk-page on the 006413 ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT) 006414 ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT) 006415 */ 006416 do { 006417 pPrevTrunk = pTrunk; 006418 if( pPrevTrunk ){ 006419 /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page 006420 ** is the page number of the next freelist trunk page in the list or 006421 ** zero if this is the last freelist trunk page. */ 006422 iTrunk = get4byte(&pPrevTrunk->aData[0]); 006423 }else{ 006424 /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32 006425 ** stores the page number of the first page of the freelist, or zero if 006426 ** the freelist is empty. */ 006427 iTrunk = get4byte(&pPage1->aData[32]); 006428 } 006429 testcase( iTrunk==mxPage ); 006430 if( iTrunk>mxPage || nSearch++ > n ){ 006431 rc = SQLITE_CORRUPT_PGNO(pPrevTrunk ? 
pPrevTrunk->pgno : 1); 006432 }else{ 006433 rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0); 006434 } 006435 if( rc ){ 006436 pTrunk = 0; 006437 goto end_allocate_page; 006438 } 006439 assert( pTrunk!=0 ); 006440 assert( pTrunk->aData!=0 ); 006441 /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page 006442 ** is the number of leaf page pointers to follow. */ 006443 k = get4byte(&pTrunk->aData[4]); 006444 if( k==0 && !searchList ){ 006445 /* The trunk has no leaves and the list is not being searched. 006446 ** So extract the trunk page itself and use it as the newly 006447 ** allocated page */ 006448 assert( pPrevTrunk==0 ); 006449 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006450 if( rc ){ 006451 goto end_allocate_page; 006452 } 006453 *pPgno = iTrunk; 006454 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 006455 *ppPage = pTrunk; 006456 pTrunk = 0; 006457 TRACE(("ALLOCATE: %u trunk - %u free pages left\n", *pPgno, n-1)); 006458 }else if( k>(u32)(pBt->usableSize/4 - 2) ){ 006459 /* Value of k is out of range. Database corruption */ 006460 rc = SQLITE_CORRUPT_PGNO(iTrunk); 006461 goto end_allocate_page; 006462 #ifndef SQLITE_OMIT_AUTOVACUUM 006463 }else if( searchList 006464 && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE)) 006465 ){ 006466 /* The list is being searched and this trunk page is the page 006467 ** to allocate, regardless of whether it has leaves. 
006468 */ 006469 *pPgno = iTrunk; 006470 *ppPage = pTrunk; 006471 searchList = 0; 006472 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006473 if( rc ){ 006474 goto end_allocate_page; 006475 } 006476 if( k==0 ){ 006477 if( !pPrevTrunk ){ 006478 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 006479 }else{ 006480 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 006481 if( rc!=SQLITE_OK ){ 006482 goto end_allocate_page; 006483 } 006484 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4); 006485 } 006486 }else{ 006487 /* The trunk page is required by the caller but it contains 006488 ** pointers to free-list leaves. The first leaf becomes a trunk 006489 ** page in this case. 006490 */ 006491 MemPage *pNewTrunk; 006492 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); 006493 if( iNewTrunk>mxPage ){ 006494 rc = SQLITE_CORRUPT_PGNO(iTrunk); 006495 goto end_allocate_page; 006496 } 006497 testcase( iNewTrunk==mxPage ); 006498 rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0); 006499 if( rc!=SQLITE_OK ){ 006500 goto end_allocate_page; 006501 } 006502 rc = sqlite3PagerWrite(pNewTrunk->pDbPage); 006503 if( rc!=SQLITE_OK ){ 006504 releasePage(pNewTrunk); 006505 goto end_allocate_page; 006506 } 006507 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); 006508 put4byte(&pNewTrunk->aData[4], k-1); 006509 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); 006510 releasePage(pNewTrunk); 006511 if( !pPrevTrunk ){ 006512 assert( sqlite3PagerIswriteable(pPage1->pDbPage) ); 006513 put4byte(&pPage1->aData[32], iNewTrunk); 006514 }else{ 006515 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 006516 if( rc ){ 006517 goto end_allocate_page; 006518 } 006519 put4byte(&pPrevTrunk->aData[0], iNewTrunk); 006520 } 006521 } 006522 pTrunk = 0; 006523 TRACE(("ALLOCATE: %u trunk - %u free pages left\n", *pPgno, n-1)); 006524 #endif 006525 }else if( k>0 ){ 006526 /* Extract a leaf from the trunk */ 006527 u32 closest; 006528 Pgno iPage; 006529 unsigned char *aData = pTrunk->aData; 006530 if( nearby>0 ){ 
006531 u32 i; 006532 closest = 0; 006533 if( eMode==BTALLOC_LE ){ 006534 for(i=0; i<k; i++){ 006535 iPage = get4byte(&aData[8+i*4]); 006536 if( iPage<=nearby ){ 006537 closest = i; 006538 break; 006539 } 006540 } 006541 }else{ 006542 int dist; 006543 dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby); 006544 for(i=1; i<k; i++){ 006545 int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby); 006546 if( d2<dist ){ 006547 closest = i; 006548 dist = d2; 006549 } 006550 } 006551 } 006552 }else{ 006553 closest = 0; 006554 } 006555 006556 iPage = get4byte(&aData[8+closest*4]); 006557 testcase( iPage==mxPage ); 006558 if( iPage>mxPage || iPage<2 ){ 006559 rc = SQLITE_CORRUPT_PGNO(iTrunk); 006560 goto end_allocate_page; 006561 } 006562 testcase( iPage==mxPage ); 006563 if( !searchList 006564 || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE)) 006565 ){ 006566 int noContent; 006567 *pPgno = iPage; 006568 TRACE(("ALLOCATE: %u was leaf %u of %u on trunk %u" 006569 ": %u more free pages\n", 006570 *pPgno, closest+1, k, pTrunk->pgno, n-1)); 006571 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006572 if( rc ) goto end_allocate_page; 006573 if( closest<k-1 ){ 006574 memcpy(&aData[8+closest*4], &aData[4+k*4], 4); 006575 } 006576 put4byte(&aData[4], k-1); 006577 noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0; 006578 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent); 006579 if( rc==SQLITE_OK ){ 006580 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 006581 if( rc!=SQLITE_OK ){ 006582 releasePage(*ppPage); 006583 *ppPage = 0; 006584 } 006585 } 006586 searchList = 0; 006587 } 006588 } 006589 releasePage(pPrevTrunk); 006590 pPrevTrunk = 0; 006591 }while( searchList ); 006592 }else{ 006593 /* There are no pages on the freelist, so append a new page to the 006594 ** database image. 006595 ** 006596 ** Normally, new pages allocated by this block can be requested from the 006597 ** pager layer with the 'no-content' flag set. 
This prevents the pager 006598 ** from trying to read the pages content from disk. However, if the 006599 ** current transaction has already run one or more incremental-vacuum 006600 ** steps, then the page we are about to allocate may contain content 006601 ** that is required in the event of a rollback. In this case, do 006602 ** not set the no-content flag. This causes the pager to load and journal 006603 ** the current page content before overwriting it. 006604 ** 006605 ** Note that the pager will not actually attempt to load or journal 006606 ** content for any page that really does lie past the end of the database 006607 ** file on disk. So the effects of disabling the no-content optimization 006608 ** here are confined to those pages that lie between the end of the 006609 ** database image and the end of the database file. 006610 */ 006611 int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0; 006612 006613 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 006614 if( rc ) return rc; 006615 pBt->nPage++; 006616 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++; 006617 006618 #ifndef SQLITE_OMIT_AUTOVACUUM 006619 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){ 006620 /* If *pPgno refers to a pointer-map page, allocate two new pages 006621 ** at the end of the file instead of one. The first allocated page 006622 ** becomes a new pointer-map page, the second is used by the caller. 
006623 */ 006624 MemPage *pPg = 0; 006625 TRACE(("ALLOCATE: %u from end of file (pointer-map page)\n", pBt->nPage)); 006626 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) ); 006627 rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent); 006628 if( rc==SQLITE_OK ){ 006629 rc = sqlite3PagerWrite(pPg->pDbPage); 006630 releasePage(pPg); 006631 } 006632 if( rc ) return rc; 006633 pBt->nPage++; 006634 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; } 006635 } 006636 #endif 006637 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage); 006638 *pPgno = pBt->nPage; 006639 006640 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 006641 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent); 006642 if( rc ) return rc; 006643 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 006644 if( rc!=SQLITE_OK ){ 006645 releasePage(*ppPage); 006646 *ppPage = 0; 006647 } 006648 TRACE(("ALLOCATE: %u from end of file\n", *pPgno)); 006649 } 006650 006651 assert( CORRUPT_DB || *pPgno!=PENDING_BYTE_PAGE(pBt) ); 006652 006653 end_allocate_page: 006654 releasePage(pTrunk); 006655 releasePage(pPrevTrunk); 006656 assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 ); 006657 assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 ); 006658 return rc; 006659 } 006660 006661 /* 006662 ** This function is used to add page iPage to the database file free-list. 006663 ** It is assumed that the page is not already a part of the free-list. 006664 ** 006665 ** The value passed as the second argument to this function is optional. 006666 ** If the caller happens to have a pointer to the MemPage object 006667 ** corresponding to page iPage handy, it may pass it as the second value. 006668 ** Otherwise, it may pass NULL. 006669 ** 006670 ** If a pointer to a MemPage object is passed as the second argument, 006671 ** its reference count is not altered by this function. 
006672 */ 006673 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ 006674 MemPage *pTrunk = 0; /* Free-list trunk page */ 006675 Pgno iTrunk = 0; /* Page number of free-list trunk page */ 006676 MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */ 006677 MemPage *pPage; /* Page being freed. May be NULL. */ 006678 int rc; /* Return Code */ 006679 u32 nFree; /* Initial number of pages on free-list */ 006680 006681 assert( sqlite3_mutex_held(pBt->mutex) ); 006682 assert( CORRUPT_DB || iPage>1 ); 006683 assert( !pMemPage || pMemPage->pgno==iPage ); 006684 006685 if( iPage<2 || iPage>pBt->nPage ){ 006686 return SQLITE_CORRUPT_BKPT; 006687 } 006688 if( pMemPage ){ 006689 pPage = pMemPage; 006690 sqlite3PagerRef(pPage->pDbPage); 006691 }else{ 006692 pPage = btreePageLookup(pBt, iPage); 006693 } 006694 006695 /* Increment the free page count on pPage1 */ 006696 rc = sqlite3PagerWrite(pPage1->pDbPage); 006697 if( rc ) goto freepage_out; 006698 nFree = get4byte(&pPage1->aData[36]); 006699 put4byte(&pPage1->aData[36], nFree+1); 006700 006701 if( pBt->btsFlags & BTS_SECURE_DELETE ){ 006702 /* If the secure_delete option is enabled, then 006703 ** always fully overwrite deleted information with zeros. 006704 */ 006705 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) ) 006706 || ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0) 006707 ){ 006708 goto freepage_out; 006709 } 006710 memset(pPage->aData, 0, pPage->pBt->pageSize); 006711 } 006712 006713 /* If the database supports auto-vacuum, write an entry in the pointer-map 006714 ** to indicate that the page is free. 006715 */ 006716 if( ISAUTOVACUUM(pBt) ){ 006717 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc); 006718 if( rc ) goto freepage_out; 006719 } 006720 006721 /* Now manipulate the actual database free-list structure. There are two 006722 ** possibilities. 
If the free-list is currently empty, or if the first 006723 ** trunk page in the free-list is full, then this page will become a 006724 ** new free-list trunk page. Otherwise, it will become a leaf of the 006725 ** first trunk page in the current free-list. This block tests if it 006726 ** is possible to add the page as a new free-list leaf. 006727 */ 006728 if( nFree!=0 ){ 006729 u32 nLeaf; /* Initial number of leaf cells on trunk page */ 006730 006731 iTrunk = get4byte(&pPage1->aData[32]); 006732 if( iTrunk>btreePagecount(pBt) ){ 006733 rc = SQLITE_CORRUPT_BKPT; 006734 goto freepage_out; 006735 } 006736 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); 006737 if( rc!=SQLITE_OK ){ 006738 goto freepage_out; 006739 } 006740 006741 nLeaf = get4byte(&pTrunk->aData[4]); 006742 assert( pBt->usableSize>32 ); 006743 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){ 006744 rc = SQLITE_CORRUPT_BKPT; 006745 goto freepage_out; 006746 } 006747 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){ 006748 /* In this case there is room on the trunk page to insert the page 006749 ** being freed as a new leaf. 006750 ** 006751 ** Note that the trunk page is not really full until it contains 006752 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have 006753 ** coded. But due to a coding error in versions of SQLite prior to 006754 ** 3.6.0, databases with freelist trunk pages holding more than 006755 ** usableSize/4 - 8 entries will be reported as corrupt. In order 006756 ** to maintain backwards compatibility with older versions of SQLite, 006757 ** we will continue to restrict the number of entries to usableSize/4 - 8 006758 ** for now. At some point in the future (once everyone has upgraded 006759 ** to 3.6.0 or later) we should consider fixing the conditional above 006760 ** to read "usableSize/4-2" instead of "usableSize/4-8". 
006761 ** 006762 ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still 006763 ** avoid using the last six entries in the freelist trunk page array in 006764 ** order that database files created by newer versions of SQLite can be 006765 ** read by older versions of SQLite. 006766 */ 006767 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006768 if( rc==SQLITE_OK ){ 006769 put4byte(&pTrunk->aData[4], nLeaf+1); 006770 put4byte(&pTrunk->aData[8+nLeaf*4], iPage); 006771 if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){ 006772 sqlite3PagerDontWrite(pPage->pDbPage); 006773 } 006774 rc = btreeSetHasContent(pBt, iPage); 006775 } 006776 TRACE(("FREE-PAGE: %u leaf on trunk page %u\n",pPage->pgno,pTrunk->pgno)); 006777 goto freepage_out; 006778 } 006779 } 006780 006781 /* If control flows to this point, then it was not possible to add the 006782 ** the page being freed as a leaf page of the first trunk in the free-list. 006783 ** Possibly because the free-list is empty, or possibly because the 006784 ** first trunk in the free-list is full. Either way, the page being freed 006785 ** will become the new first trunk page in the free-list. 
006786 */ 006787 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){ 006788 goto freepage_out; 006789 } 006790 rc = sqlite3PagerWrite(pPage->pDbPage); 006791 if( rc!=SQLITE_OK ){ 006792 goto freepage_out; 006793 } 006794 put4byte(pPage->aData, iTrunk); 006795 put4byte(&pPage->aData[4], 0); 006796 put4byte(&pPage1->aData[32], iPage); 006797 TRACE(("FREE-PAGE: %u new trunk page replacing %u\n", pPage->pgno, iTrunk)); 006798 006799 freepage_out: 006800 if( pPage ){ 006801 pPage->isInit = 0; 006802 } 006803 releasePage(pPage); 006804 releasePage(pTrunk); 006805 return rc; 006806 } 006807 static void freePage(MemPage *pPage, int *pRC){ 006808 if( (*pRC)==SQLITE_OK ){ 006809 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno); 006810 } 006811 } 006812 006813 /* 006814 ** Free the overflow pages associated with the given Cell. 006815 */ 006816 static SQLITE_NOINLINE int clearCellOverflow( 006817 MemPage *pPage, /* The page that contains the Cell */ 006818 unsigned char *pCell, /* First byte of the Cell */ 006819 CellInfo *pInfo /* Size information about the cell */ 006820 ){ 006821 BtShared *pBt; 006822 Pgno ovflPgno; 006823 int rc; 006824 int nOvfl; 006825 u32 ovflPageSize; 006826 006827 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 006828 assert( pInfo->nLocal!=pInfo->nPayload ); 006829 testcase( pCell + pInfo->nSize == pPage->aDataEnd ); 006830 testcase( pCell + (pInfo->nSize-1) == pPage->aDataEnd ); 006831 if( pCell + pInfo->nSize > pPage->aDataEnd ){ 006832 /* Cell extends past end of page */ 006833 return SQLITE_CORRUPT_PAGE(pPage); 006834 } 006835 ovflPgno = get4byte(pCell + pInfo->nSize - 4); 006836 pBt = pPage->pBt; 006837 assert( pBt->usableSize > 4 ); 006838 ovflPageSize = pBt->usableSize - 4; 006839 nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize; 006840 assert( nOvfl>0 || 006841 (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize) 006842 ); 006843 while( nOvfl-- ){ 006844 Pgno iNext = 0; 006845 MemPage 
*pOvfl = 0; 006846 if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){ 006847 /* 0 is not a legal page number and page 1 cannot be an 006848 ** overflow page. Therefore if ovflPgno<2 or past the end of the 006849 ** file the database must be corrupt. */ 006850 return SQLITE_CORRUPT_BKPT; 006851 } 006852 if( nOvfl ){ 006853 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext); 006854 if( rc ) return rc; 006855 } 006856 006857 if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) ) 006858 && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1 006859 ){ 006860 /* There is no reason any cursor should have an outstanding reference 006861 ** to an overflow page belonging to a cell that is being deleted/updated. 006862 ** So if there exists more than one reference to this page, then it 006863 ** must not really be an overflow page and the database must be corrupt. 006864 ** It is helpful to detect this before calling freePage2(), as 006865 ** freePage2() may zero the page contents if secure-delete mode is 006866 ** enabled. If this 'overflow' page happens to be a page that the 006867 ** caller is iterating through or using in some other way, this 006868 ** can be problematic. 006869 */ 006870 rc = SQLITE_CORRUPT_BKPT; 006871 }else{ 006872 rc = freePage2(pBt, pOvfl, ovflPgno); 006873 } 006874 006875 if( pOvfl ){ 006876 sqlite3PagerUnref(pOvfl->pDbPage); 006877 } 006878 if( rc ) return rc; 006879 ovflPgno = iNext; 006880 } 006881 return SQLITE_OK; 006882 } 006883 006884 /* Call xParseCell to compute the size of a cell. If the cell contains 006885 ** overflow, then invoke cellClearOverflow to clear out that overflow. 006886 ** Store the result code (SQLITE_OK or some error code) in rc. 006887 ** 006888 ** Implemented as macro to force inlining for performance. 
**
** Note that the expansion calls clearCellOverflow() and consists of a
** statement followed by an if/else.  It is not a single expression and
** is not wrapped in do{...}while(0), so it must be used as a complete
** statement inside a braced block.
*/
#define BTREE_CLEAR_CELL(rc, pPage, pCell, sInfo)   \
  pPage->xParseCell(pPage, pCell, &sInfo);          \
  if( sInfo.nLocal!=sInfo.nPayload ){               \
    rc = clearCellOverflow(pPage, pCell, &sInfo);   \
  }else{                                            \
    rc = SQLITE_OK;                                 \
  }


/*
** Create the byte sequence used to represent a cell on page pPage
** and write that byte sequence into pCell[].  Overflow pages are
** allocated and filled in as necessary.  The calling procedure
** is responsible for making sure sufficient space has been allocated
** for pCell[].
**
** Note that pCell does not necessarily need to point to the pPage->aData
** area.  pCell might point to some temporary storage.  The cell will
** be constructed in this temporary area then copied into pPage->aData
** later.
**
** For an intkey page the payload is pX->nData bytes taken from pX->pData
** followed by pX->nZero bytes of zeros.  Otherwise the payload is the
** pX->nKey bytes at pX->pKey.
**
** The size of the local part of the cell (header, local payload and,
** when content spills, the 4-byte overflow page number) is written
** into *pnSize.  That size is never less than 4.
**
** Return SQLITE_OK on success.  If allocating an overflow page or
** writing its pointer-map entry fails, the error code from that
** operation is returned instead.
*/
static int fillInCell(
  MemPage *pPage,                /* The page that contains the cell */
  unsigned char *pCell,          /* Complete text of the cell */
  const BtreePayload *pX,        /* Payload with which to construct the cell */
  int *pnSize                    /* Write cell size here */
){
  int nPayload;                  /* Payload bytes not yet written */
  const u8 *pSrc;                /* Next byte of source content to copy */
  int nSrc, n, rc, mn;           /* nSrc: source bytes left at pSrc */
  int spaceLeft;                 /* Space still available at pPayload */
  MemPage *pToRelease;           /* Overflow page being filled, or NULL */
  unsigned char *pPrior;         /* Where to write pgno of next overflow page */
  unsigned char *pPayload;       /* Where the next payload bytes are written */
  BtShared *pBt;
  Pgno pgnoOvfl;                 /* Most recently allocated overflow page */
  int nHeader;                   /* Number of bytes in the cell header */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );

  /* pPage is not necessarily writeable since pCell might be auxiliary
  ** buffer space that is separate from the pPage buffer area */
  assert( pCell<pPage->aData || pCell>=&pPage->aData[pPage->pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

  /* Fill in the header.  The header begins after the child pointer
  ** (if any) and holds the payload size as a varint, followed on
  ** intkey pages by the integer key as a varint. */
  nHeader = pPage->childPtrSize;
  if( pPage->intKey ){
    nPayload = pX->nData + pX->nZero;
    pSrc = pX->pData;
    nSrc = pX->nData;
    assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */
    nHeader += putVarint32(&pCell[nHeader], nPayload);
    nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey);
  }else{
    assert( pX->nKey<=0x7fffffff && pX->pKey!=0 );
    nSrc = nPayload = (int)pX->nKey;
    pSrc = pX->pKey;
    nHeader += putVarint32(&pCell[nHeader], nPayload);
  }

  /* Fill in the payload */
  pPayload = &pCell[nHeader];
  if( nPayload<=pPage->maxLocal ){
    /* This is the common case where everything fits on the btree page
    ** and no overflow pages are required. */
    n = nHeader + nPayload;
    testcase( n==3 );
    testcase( n==4 );
    if( n<4 ){
      /* Pad the cell out to the 4-byte minimum size */
      n = 4;
      pPayload[nPayload] = 0;
    }
    *pnSize = n;
    assert( nSrc<=nPayload );
    testcase( nSrc<nPayload );
    memcpy(pPayload, pSrc, nSrc);
    /* Any payload beyond the source bytes is zero-filled */
    memset(pPayload+nSrc, 0, nPayload-nSrc);
    return SQLITE_OK;
  }

  /* If we reach this point, it means that some of the content will need
  ** to spill onto overflow pages.
  */
  mn = pPage->minLocal;
  n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
  testcase( n==pPage->maxLocal );
  testcase( n==pPage->maxLocal+1 );
  if( n > pPage->maxLocal ) n = mn;
  spaceLeft = n;
  *pnSize = n + nHeader + 4;
  pPrior = &pCell[nHeader+n];
  pToRelease = 0;
  pgnoOvfl = 0;
  pBt = pPage->pBt;

  /* At this point variables should be set as follows:
  **
  **   nPayload           Total payload size in bytes
  **   pPayload           Begin writing payload here
  **   spaceLeft          Space available at pPayload.  If nPayload>spaceLeft,
  **                      that means content must spill into overflow pages.
  **   *pnSize            Size of the local cell (not counting overflow pages)
  **   pPrior             Where to write the pgno of the first overflow page
  **
  ** Use a call to btreeParseCellPtr() to verify that the values above
  ** were computed correctly.
  */
#ifdef SQLITE_DEBUG
  {
    CellInfo info;
    pPage->xParseCell(pPage, pCell, &info);
    assert( nHeader==(int)(info.pPayload - pCell) );
    assert( info.nKey==pX->nKey );
    assert( *pnSize == info.nSize );
    assert( spaceLeft == info.nLocal );
  }
#endif

  /* Write the payload into the local Cell and any extra into overflow pages */
  while( 1 ){
    n = nPayload;
    if( n>spaceLeft ) n = spaceLeft;

    /* If pToRelease is not zero then pPayload points into the data area
    ** of pToRelease.  Make sure pToRelease is still writeable. */
    assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

    /* If pPayload is part of the data area of pPage, then make sure pPage
    ** is still writeable */
    assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

    /* Copy from the source while source bytes remain.  Once the source
    ** is exhausted the rest of the payload is written as zeros. */
    if( nSrc>=n ){
      memcpy(pPayload, pSrc, n);
    }else if( nSrc>0 ){
      n = nSrc;
      memcpy(pPayload, pSrc, n);
    }else{
      memset(pPayload, 0, n);
    }
    nPayload -= n;
    if( nPayload<=0 ) break;
    pPayload += n;
    pSrc += n;
    nSrc -= n;
    spaceLeft -= n;
    if( spaceLeft==0 ){
      /* The current destination is full.  Allocate the next overflow
      ** page and link it in through pPrior. */
      MemPage *pOvfl = 0;
#ifndef SQLITE_OMIT_AUTOVACUUM
      Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
      if( pBt->autoVacuum ){
        /* Advance the page-number hint past pointer-map pages and the
        ** pending-byte page */
        do{
          pgnoOvfl++;
        } while(
          PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
        );
      }
#endif
      rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
#ifndef SQLITE_OMIT_AUTOVACUUM
      /* If the database supports auto-vacuum, and the second or subsequent
      ** overflow page is being allocated, add an entry to the pointer-map
      ** for that page now.
      **
      ** If this is the first overflow page, then write a partial entry
      ** to the pointer-map. If we write nothing to this pointer-map slot,
      ** then the optimistic overflow chain processing in clearCell()
      ** may misinterpret the uninitialized values and delete the
      ** wrong pages from the database.
      */
      if( pBt->autoVacuum && rc==SQLITE_OK ){
        u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
        ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
        if( rc ){
          releasePage(pOvfl);
        }
      }
#endif
      if( rc ){
        releasePage(pToRelease);
        return rc;
      }

      /* If pToRelease is not zero then pPrior points into the data area
      ** of pToRelease.  Make sure pToRelease is still writeable. */
      assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

      /* If pPrior is part of the data area of pPage, then make sure pPage
      ** is still writeable */
      assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

      /* Link the new page into the chain, then make it the current
      ** destination.  Its first 4 bytes (the next-page pointer) start
      ** out as zero and payload begins at byte 4. */
      put4byte(pPrior, pgnoOvfl);
      releasePage(pToRelease);
      pToRelease = pOvfl;
      pPrior = pOvfl->aData;
      put4byte(pPrior, 0);
      pPayload = &pOvfl->aData[4];
      spaceLeft = pBt->usableSize - 4;
    }
  }
  releasePage(pToRelease);
  return SQLITE_OK;
}

/*
** Remove the i-th cell from pPage.  This routine affects pPage only.
** The cell content is not freed or deallocated.  It is assumed that
** the cell content has been copied someplace else.  This routine just
** removes the reference to the cell from pPage.
007101 ** 007102 ** "sz" must be the number of bytes in the cell. 007103 */ 007104 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ 007105 u32 pc; /* Offset to cell content of cell being deleted */ 007106 u8 *data; /* pPage->aData */ 007107 u8 *ptr; /* Used to move bytes around within data[] */ 007108 int rc; /* The return code */ 007109 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */ 007110 007111 if( *pRC ) return; 007112 assert( idx>=0 ); 007113 assert( idx<pPage->nCell ); 007114 assert( CORRUPT_DB || sz==cellSize(pPage, idx) ); 007115 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 007116 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007117 assert( pPage->nFree>=0 ); 007118 data = pPage->aData; 007119 ptr = &pPage->aCellIdx[2*idx]; 007120 assert( pPage->pBt->usableSize > (u32)(ptr-data) ); 007121 pc = get2byte(ptr); 007122 hdr = pPage->hdrOffset; 007123 testcase( pc==(u32)get2byte(&data[hdr+5]) ); 007124 testcase( pc+sz==pPage->pBt->usableSize ); 007125 if( pc+sz > pPage->pBt->usableSize ){ 007126 *pRC = SQLITE_CORRUPT_BKPT; 007127 return; 007128 } 007129 rc = freeSpace(pPage, pc, sz); 007130 if( rc ){ 007131 *pRC = rc; 007132 return; 007133 } 007134 pPage->nCell--; 007135 if( pPage->nCell==0 ){ 007136 memset(&data[hdr+1], 0, 4); 007137 data[hdr+7] = 0; 007138 put2byte(&data[hdr+5], pPage->pBt->usableSize); 007139 pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset 007140 - pPage->childPtrSize - 8; 007141 }else{ 007142 memmove(ptr, ptr+2, 2*(pPage->nCell - idx)); 007143 put2byte(&data[hdr+3], pPage->nCell); 007144 pPage->nFree += 2; 007145 } 007146 } 007147 007148 /* 007149 ** Insert a new cell on pPage at cell index "i". pCell points to the 007150 ** content of the cell. 007151 ** 007152 ** If the cell content will fit on the page, then put it there. If it 007153 ** will not fit, then make a copy of the cell content into pTemp if 007154 ** pTemp is not null. 
** Regardless of pTemp, allocate a new entry
** in pPage->apOvfl[] and make it point to the cell content (either
** in pTemp or the original pCell) and also record its index.
** Allocating a new entry in pPage->aCell[] implies that
** pPage->nOverflow is incremented.
**
** The insertCellFast() routine below works exactly the same as
** insertCell() except that it lacks the pTemp and iChild parameters
** which are assumed zero.  Other than that, the two routines are the
** same.
**
** Fixes or enhancements to this routine should be reflected in
** insertCellFast()!
**
** This routine asserts that iChild>0 and always overwrites the first
** 4 bytes of the stored cell with iChild.
**
** Return SQLITE_OK on success.  Otherwise return the error code from
** sqlite3PagerWrite(), allocateSpace() or (in auto-vacuum databases)
** ptrmapPutOvflPtr().  The overflow-cell path always returns SQLITE_OK.
*/
static int insertCell(
  MemPage *pPage,   /* Page into which we are copying */
  int i,            /* New cell becomes the i-th cell of the page */
  u8 *pCell,        /* Content of the new cell */
  int sz,           /* Bytes of content in pCell */
  u8 *pTemp,        /* Temp storage space for pCell, if needed */
  Pgno iChild       /* If non-zero, replace first 4 bytes with this value */
){
  int idx = 0;      /* Where to write new cell content in data[] */
  int j;            /* Loop counter */
  u8 *data;         /* The content of the whole page */
  u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */

  assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
  assert( MX_CELL(pPage->pBt)<=10921 );
  assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
  assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
  assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB );
  assert( pPage->nFree>=0 );
  assert( iChild>0 );
  if( pPage->nOverflow || sz+2>pPage->nFree ){
    /* The cell does not fit (or the page already has overflow cells).
    ** Record it as an overflow cell rather than writing it to the page. */
    if( pTemp ){
      memcpy(pTemp, pCell, sz);
      pCell = pTemp;
    }
    put4byte(pCell, iChild);
    j = pPage->nOverflow++;
    /* Comparison against ArraySize-1 since we hold back one extra slot
    ** as a contingency.  In other words, never need more than 3 overflow
    ** slots but 4 are allocated, just to be safe. */
    assert( j < ArraySize(pPage->apOvfl)-1 );
    pPage->apOvfl[j] = pCell;
    pPage->aiOvfl[j] = (u16)i;

    /* When multiple overflows occur, they are always sequential and in
    ** sorted order.  These invariants arise because multiple overflows can
    ** only occur when inserting divider cells into the parent page during
    ** balancing, and the dividers are adjacent and sorted.
    */
    assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
    assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
  }else{
    int rc = sqlite3PagerWrite(pPage->pDbPage);
    if( NEVER(rc!=SQLITE_OK) ){
      return rc;
    }
    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
    data = pPage->aData;
    assert( &data[pPage->cellOffset]==pPage->aCellIdx );
    rc = allocateSpace(pPage, sz, &idx);
    if( rc ){ return rc; }
    /* The allocateSpace() routine guarantees the following properties
    ** if it returns successfully */
    assert( idx >= 0 );
    assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
    assert( idx+sz <= (int)pPage->pBt->usableSize );
    /* Account for the cell body plus its 2-byte cell-pointer entry */
    pPage->nFree -= (u16)(2 + sz);
    /* In a corrupt database where an entry in the cell index section of
    ** a btree page has a value of 3 or less, the pCell value might point
    ** as many as 4 bytes in front of the start of the aData buffer for
    ** the source page.  Make sure this does not cause problems by not
    ** reading the first 4 bytes */
    memcpy(&data[idx+4], pCell+4, sz-4);
    put4byte(&data[idx], iChild);
    /* Open a 2-byte gap at position i of the cell-pointer array and
    ** store the offset of the new cell there */
    pIns = pPage->aCellIdx + i*2;
    memmove(pIns+2, pIns, 2*(pPage->nCell - i));
    put2byte(pIns, idx);
    pPage->nCell++;
    /* increment the cell count */
    if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
    assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB );
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pPage->pBt->autoVacuum ){
      int rc2 = SQLITE_OK;
      /* The cell may contain a pointer to an overflow page. If so, write
      ** the entry for the overflow page into the pointer map.
      */
      ptrmapPutOvflPtr(pPage, pPage, pCell, &rc2);
      if( rc2 ) return rc2;
    }
#endif
  }
  return SQLITE_OK;
}

/*
** This variant of insertCell() assumes that the pTemp and iChild
** parameters are both zero.  Use this variant in sqlite3BtreeInsert()
** for performance improvement, and also so that this variant is only
** called from that one place, and is thus inlined, and thus runs much
** faster.
**
** Fixes or enhancements to this routine should be reflected into
** the insertCell() routine.
**
** This routine asserts that pPage has no overflow cells on entry.
** If the new cell does not fit, pCell itself (not a copy) is recorded
** in pPage->apOvfl[].
**
** Return SQLITE_OK on success.  Otherwise return the error code from
** sqlite3PagerWrite(), allocateSpace() or (in auto-vacuum databases)
** ptrmapPutOvflPtr().
*/
static int insertCellFast(
  MemPage *pPage,   /* Page into which we are copying */
  int i,            /* New cell becomes the i-th cell of the page */
  u8 *pCell,        /* Content of the new cell */
  int sz            /* Bytes of content in pCell */
){
  int idx = 0;      /* Where to write new cell content in data[] */
  int j;            /* Loop counter */
  u8 *data;         /* The content of the whole page */
  u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */

  assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
  assert( MX_CELL(pPage->pBt)<=10921 );
  assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
  assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
  assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB );
  assert( pPage->nFree>=0 );
  assert( pPage->nOverflow==0 );
  if( sz+2>pPage->nFree ){
    /* Not enough room for the cell body plus its 2-byte cell pointer.
    ** Record the cell as an overflow cell. */
    j = pPage->nOverflow++;
    /* Comparison against ArraySize-1 since we hold back one extra slot
    ** as a contingency.  In other words, never need more than 3 overflow
    ** slots but 4 are allocated, just to be safe. */
    assert( j < ArraySize(pPage->apOvfl)-1 );
    pPage->apOvfl[j] = pCell;
    pPage->aiOvfl[j] = (u16)i;

    /* When multiple overflows occur, they are always sequential and in
    ** sorted order.  These invariants arise because multiple overflows can
    ** only occur when inserting divider cells into the parent page during
    ** balancing, and the dividers are adjacent and sorted.
    */
    assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
    assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
  }else{
    int rc = sqlite3PagerWrite(pPage->pDbPage);
    if( rc!=SQLITE_OK ){
      return rc;
    }
    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
    data = pPage->aData;
    assert( &data[pPage->cellOffset]==pPage->aCellIdx );
    rc = allocateSpace(pPage, sz, &idx);
    if( rc ){ return rc; }
    /* The allocateSpace() routine guarantees the following properties
    ** if it returns successfully */
    assert( idx >= 0 );
    assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
    assert( idx+sz <= (int)pPage->pBt->usableSize );
    /* Account for the cell body plus its 2-byte cell-pointer entry */
    pPage->nFree -= (u16)(2 + sz);
    memcpy(&data[idx], pCell, sz);
    /* Open a 2-byte gap at position i of the cell-pointer array and
    ** store the offset of the new cell there */
    pIns = pPage->aCellIdx + i*2;
    memmove(pIns+2, pIns, 2*(pPage->nCell - i));
    put2byte(pIns, idx);
    pPage->nCell++;
    /* increment the cell count */
    if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
    assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB );
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pPage->pBt->autoVacuum ){
      int rc2 = SQLITE_OK;
      /* The cell may contain a pointer to an overflow page. If so, write
      ** the entry for the overflow page into the pointer map.
      */
      ptrmapPutOvflPtr(pPage, pPage, pCell, &rc2);
      if( rc2 ) return rc2;
    }
#endif
  }
  return SQLITE_OK;
}

/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.
**
** The minimum value of NN is 1 (of course).  Increasing NN above 1
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** The current value of NN (1) appears to give the best results overall.
**
** (Later:) The description above makes it seem as if these values are
** tunable - as if you could change them and recompile and it would all work.
** But that is unlikely.  NB has been 3 since the inception of SQLite and
** we have never tested any other value.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB 3             /* (NN*2+1):  Total pages involved in the balance */

/*
** A CellArray object contains a cache of pointers and sizes for a
** consecutive sequence of cells that might be held on multiple pages.
**
** The cells in this array are the divider cell or cells from the pParent
** page plus up to three child pages.  There are a total of nCell cells.
**
** pRef is a pointer to one of the pages that contributes cells.  This is
** used to access information such as MemPage.intKey and MemPage.pBt->pageSize
** which should be common to all pages that contribute cells to this array.
**
** apCell[] and szCell[] hold, respectively, pointers to the start of each
** cell and the size of each cell.  Some of the apCell[] pointers might refer
** to overflow cells.  In other words, some apCell[] pointers might not point
** to content area of the pages.
**
** A szCell[] of zero means the size of that cell has not yet been computed.
**
** The cells come from as many as four different pages:
**
**             -----------
**             | Parent  |
**             -----------
**            /     |     \
**           /      |      \
**  ---------   ---------   ---------
**  |Child-1|   |Child-2|   |Child-3|
**  ---------   ---------   ---------
**
** The order of cells in the array for an index btree is:
**
**       1.  All cells from Child-1 in order
**       2.  The first divider cell from Parent
**       3.  All cells from Child-2 in order
**       4.  The second divider cell from Parent
**       5.  All cells from Child-3 in order
**
** For a table-btree (with rowids) the items 2 and 4 are empty because
** content exists only in leaves and there are no divider cells.
**
** For an index btree, the apEnd[] array holds pointer to the end of page
** for Child-1, the Parent, Child-2, the Parent (again), and Child-3,
** respectively. The ixNx[] array holds the number of cells contained in
** each of these 5 stages, and all stages to the left.  Hence:
**
**    ixNx[0] = Number of cells in Child-1.
**    ixNx[1] = Number of cells in Child-1 plus 1 for first divider.
**    ixNx[2] = Number of cells in Child-1 and Child-2 + 1 for 1st divider.
**    ixNx[3] = Number of cells in Child-1 and Child-2 + both divider cells
**    ixNx[4] = Total number of cells.
**
** For a table-btree, the concept is similar, except only apEnd[0]..apEnd[2]
** are used and they point to the leaf pages only, and the ixNx value are:
**
**    ixNx[0] = Number of cells in Child-1.
**    ixNx[1] = Number of cells in Child-1 and Child-2.
**    ixNx[2] = Total number of cells.
**
** Sometimes when deleting, a child page can have zero cells.  In those
** cases, ixNx[] entries with higher indexes, and the corresponding apEnd[]
** entries, shift down.
The end result is that each ixNx[] entry should 007420 ** be larger than the previous 007421 */ 007422 typedef struct CellArray CellArray; 007423 struct CellArray { 007424 int nCell; /* Number of cells in apCell[] */ 007425 MemPage *pRef; /* Reference page */ 007426 u8 **apCell; /* All cells begin balanced */ 007427 u16 *szCell; /* Local size of all cells in apCell[] */ 007428 u8 *apEnd[NB*2]; /* MemPage.aDataEnd values */ 007429 int ixNx[NB*2]; /* Index of at which we move to the next apEnd[] */ 007430 }; 007431 007432 /* 007433 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been 007434 ** computed. 007435 */ 007436 static void populateCellCache(CellArray *p, int idx, int N){ 007437 MemPage *pRef = p->pRef; 007438 u16 *szCell = p->szCell; 007439 assert( idx>=0 && idx+N<=p->nCell ); 007440 while( N>0 ){ 007441 assert( p->apCell[idx]!=0 ); 007442 if( szCell[idx]==0 ){ 007443 szCell[idx] = pRef->xCellSize(pRef, p->apCell[idx]); 007444 }else{ 007445 assert( CORRUPT_DB || 007446 szCell[idx]==pRef->xCellSize(pRef, p->apCell[idx]) ); 007447 } 007448 idx++; 007449 N--; 007450 } 007451 } 007452 007453 /* 007454 ** Return the size of the Nth element of the cell array 007455 */ 007456 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){ 007457 assert( N>=0 && N<p->nCell ); 007458 assert( p->szCell[N]==0 ); 007459 p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]); 007460 return p->szCell[N]; 007461 } 007462 static u16 cachedCellSize(CellArray *p, int N){ 007463 assert( N>=0 && N<p->nCell ); 007464 if( p->szCell[N] ) return p->szCell[N]; 007465 return computeCellSize(p, N); 007466 } 007467 007468 /* 007469 ** Array apCell[] contains pointers to nCell b-tree page cells. The 007470 ** szCell[] array contains the size in bytes of each cell. This function 007471 ** replaces the current contents of page pPg with the contents of the cell 007472 ** array. 007473 ** 007474 ** Some of the cells in apCell[] may currently be stored in pPg. 
** This function works around problems caused by this by making a copy of
** any such cells before overwriting the page data.
**
** The MemPage.nFree field is invalidated by this function. It is the
** responsibility of the caller to set it correctly.
**
** Return SQLITE_OK on success, or SQLITE_CORRUPT if a cell extends past
** the end of the page it lives on or if the rebuilt cell content would
** collide with the cell-pointer array.
*/
static int rebuildPage(
  CellArray *pCArray,             /* Content to be added to page pPg */
  int iFirst,                     /* First cell in pCArray to use */
  int nCell,                      /* Final number of cells on page */
  MemPage *pPg                    /* The page to be reconstructed */
){
  const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
  u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
  const int usableSize = pPg->pBt->usableSize;
  u8 * const pEnd = &aData[usableSize];
  int i = iFirst;                 /* Which cell to copy from pCArray*/
  u32 j;                          /* Start of cell content area */
  int iEnd = i+nCell;             /* Loop terminator */
  u8 *pCellptr = pPg->aCellIdx;
  u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
  u8 *pData;
  int k;                          /* Current slot in pCArray->apEnd[] */
  u8 *pSrcEnd;                    /* Current pCArray->apEnd[k] value */

  assert( nCell>0 );
  assert( i<iEnd );
  j = get2byte(&aData[hdr+5]);
  if( j>(u32)usableSize ){ j = 0; }
  /* Save the existing cell content area in temp space so that cells
  ** currently stored on pPg can still be read after the page is
  ** overwritten */
  memcpy(&pTmp[j], &aData[j], usableSize - j);

  /* Find the apEnd[] slot that applies to cell i */
  for(k=0; ALWAYS(k<NB*2) && pCArray->ixNx[k]<=i; k++){}
  pSrcEnd = pCArray->apEnd[k];

  /* Cells are written from the end of the usable area downwards */
  pData = pEnd;
  while( 1/*exit by break*/ ){
    u8 *pCell = pCArray->apCell[i];
    u16 sz = pCArray->szCell[i];
    assert( sz>0 );
    if( SQLITE_WITHIN(pCell,aData+j,pEnd) ){
      /* The cell lives on pPg itself.  Read it from the saved copy. */
      if( ((uptr)(pCell+sz))>(uptr)pEnd ) return SQLITE_CORRUPT_BKPT;
      pCell = &pTmp[pCell - aData];
    }else if( (uptr)(pCell+sz)>(uptr)pSrcEnd
           && (uptr)(pCell)<(uptr)pSrcEnd
    ){
      /* The cell straddles the end of its source page */
      return SQLITE_CORRUPT_BKPT;
    }

    pData -= sz;
    put2byte(pCellptr, (pData - aData));
    pCellptr += 2;
    if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
    memmove(pData, pCell, sz);
    assert( sz==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
    i++;
    if( i>=iEnd ) break;
    if( pCArray->ixNx[k]<=i ){
      k++;
      pSrcEnd = pCArray->apEnd[k];
    }
  }

  /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
  pPg->nCell = nCell;
  pPg->nOverflow = 0;

  put2byte(&aData[hdr+1], 0);
  put2byte(&aData[hdr+3], pPg->nCell);
  put2byte(&aData[hdr+5], pData - aData);
  aData[hdr+7] = 0x00;
  return SQLITE_OK;
}

/*
** The pCArray object contains pointers to b-tree cells and the cell sizes.
** This function attempts to add the cells stored in the array to page pPg.
** If it cannot (because the page needs to be defragmented before the cells
** will fit), non-zero is returned. Otherwise, if the cells are added
** successfully, zero is returned.
**
** Argument pCellptr points to the first entry in the cell-pointer array
** (part of page pPg) to populate. After cell apCell[0] is written to the
** page body, a 16-bit offset is written to pCellptr. And so on, for each
** cell in the array. It is the responsibility of the caller to ensure
** that it is safe to overwrite this part of the cell-pointer array.
**
** When this function is called, *ppData points to the start of the
** content area on page pPg. If the size of the content area is extended,
** *ppData is updated to point to the new start of the content area
** before returning.
**
** Finally, argument pBegin points to the byte immediately following the
** end of the space required by this page for the cell-pointer area (for
** all cells - not just those inserted by the current call).
** If the content area must be extended to before this point in order to
** accommodate all cells in apCell[], then the cells do not fit and
** non-zero is returned.
**
** Non-zero is also returned if a source cell straddles the end of the
** page it is stored on.  *ppData is only updated when zero is returned.
*/
static int pageInsertArray(
  MemPage *pPg,                   /* Page to add cells to */
  u8 *pBegin,                     /* End of cell-pointer array */
  u8 **ppData,                    /* IN/OUT: Page content-area pointer */
  u8 *pCellptr,                   /* Pointer to cell-pointer area */
  int iFirst,                     /* Index of first cell to add */
  int nCell,                      /* Number of cells to add to pPg */
  CellArray *pCArray              /* Array of cells */
){
  int i = iFirst;                 /* Loop counter - cell index to insert */
  u8 *aData = pPg->aData;         /* Complete page */
  u8 *pData = *ppData;            /* Content area.  A subset of aData[] */
  int iEnd = iFirst + nCell;      /* End of loop. One past last cell to ins */
  int k;                          /* Current slot in pCArray->apEnd[] */
  u8 *pEnd;                       /* Maximum extent of cell data */
  assert( CORRUPT_DB || pPg->hdrOffset==0 );    /* Never called on page 1 */
  if( iEnd<=iFirst ) return 0;
  /* Find the apEnd[] slot that applies to cell i */
  for(k=0; ALWAYS(k<NB*2) && pCArray->ixNx[k]<=i ; k++){}
  pEnd = pCArray->apEnd[k];
  while( 1 /*Exit by break*/ ){
    int sz, rc;
    u8 *pSlot;
    assert( pCArray->szCell[i]!=0 );
    sz = pCArray->szCell[i];
    /* Try pageFindSlot() first unless header bytes 1 and 2 are both zero
    ** (presumably meaning the page has no free blocks - TODO confirm).
    ** Otherwise, or if no slot is found, carve space off the front of the
    ** content area. */
    if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
      if( (pData - pBegin)<sz ) return 1;
      pData -= sz;
      pSlot = pData;
    }
    /* pSlot and pCArray->apCell[i] will never overlap on a well-formed
    ** database.  But they might for a corrupt database.  Hence use memmove()
    ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */
    assert( (pSlot+sz)<=pCArray->apCell[i]
         || pSlot>=(pCArray->apCell[i]+sz)
         || CORRUPT_DB );
    if( (uptr)(pCArray->apCell[i]+sz)>(uptr)pEnd
     && (uptr)(pCArray->apCell[i])<(uptr)pEnd
    ){
      /* The source cell straddles the end of its page */
      assert( CORRUPT_DB );
      (void)SQLITE_CORRUPT_BKPT;
      return 1;
    }
    memmove(pSlot, pCArray->apCell[i], sz);
    put2byte(pCellptr, (pSlot - aData));
    pCellptr += 2;
    i++;
    if( i>=iEnd ) break;
    if( pCArray->ixNx[k]<=i ){
      k++;
      pEnd = pCArray->apEnd[k];
    }
  }
  *ppData = pData;
  return 0;
}

/*
** The pCArray object contains pointers to b-tree cells and their sizes.
**
** This function adds the space associated with each cell in the array
** that is currently stored within the body of pPg to the pPg free-list.
** The cell-pointers and other fields of the page are not updated.
**
** This function returns the total number of cells added to the free-list.
**
** Byte ranges to be freed are accumulated in aOfst[]/aAfter[] and
** adjacent ranges are merged, so that freeSpace() is called once per
** merged range.  If a cell extends past the end of the usable area, the
** function returns 0 immediately without freeing the pending ranges.
** The return value of freeSpace() is not checked.
*/
static int pageFreeArray(
  MemPage *pPg,                   /* Page to edit */
  int iFirst,                     /* First cell to delete */
  int nCell,                      /* Cells to delete */
  CellArray *pCArray              /* Array of cells */
){
  u8 * const aData = pPg->aData;
  u8 * const pEnd = &aData[pPg->pBt->usableSize];
  u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
  int nRet = 0;                   /* Number of cells freed so far */
  int i, j;
  int iEnd = iFirst + nCell;
  int nFree = 0;                  /* Number of pending ranges */
  int aOfst[10];                  /* Start offset of each pending range */
  int aAfter[10];                 /* Offset just past each pending range */

  for(i=iFirst; i<iEnd; i++){
    u8 *pCell = pCArray->apCell[i];
    if( SQLITE_WITHIN(pCell, pStart, pEnd) ){
      int sz;
      int iAfter;
      int iOfst;
      /* No need to use cachedCellSize() here.  The sizes of all cells that
      ** are to be freed have already been computed while deciding which
      ** cells need freeing */
      sz = pCArray->szCell[i];  assert( sz>0 );
      iOfst = (u16)(pCell - aData);
      iAfter = iOfst+sz;
      /* Try to extend an existing pending range at either end */
      for(j=0; j<nFree; j++){
        if( aOfst[j]==iAfter ){
          aOfst[j] = iOfst;
          break;
        }else if( aAfter[j]==iOfst ){
          aAfter[j] = iAfter;
          break;
        }
      }
      if( j>=nFree ){
        /* No adjacent range.  Start a new one, first flushing all
        ** pending ranges if the arrays are full. */
        if( nFree>=(int)(sizeof(aOfst)/sizeof(aOfst[0])) ){
          for(j=0; j<nFree; j++){
            freeSpace(pPg, aOfst[j], aAfter[j]-aOfst[j]);
          }
          nFree = 0;
        }
        aOfst[nFree] = iOfst;
        aAfter[nFree] = iAfter;
        if( &aData[iAfter]>pEnd ) return 0;
        nFree++;
      }
      nRet++;
    }
  }
  /* Flush whatever ranges remain pending */
  for(j=0; j<nFree; j++){
    freeSpace(pPg, aOfst[j], aAfter[j]-aOfst[j]);
  }
  return nRet;
}

/*
** pCArray contains pointers to and sizes of all cells in the page being
** balanced.  The current page, pPg, has pPg->nCell cells starting with
** pCArray->apCell[iOld].  After balancing, this page should hold nNew cells
** starting at apCell[iNew].
**
** This routine makes the necessary adjustments to pPg so that it contains
** the correct cells after being balanced.
**
** The pPg->nFree field is invalid when this function returns.  It is the
** responsibility of the caller to set it correctly.
**
** If the page cannot be edited in place, it is rebuilt from scratch
** using rebuildPage().  Return SQLITE_OK on success, SQLITE_CORRUPT if
** corruption is detected, or whatever rebuildPage() returns when that
** fallback is taken.
*/
static int editPage(
  MemPage *pPg,                   /* Edit this page */
  int iOld,                       /* Index of first cell currently on page */
  int iNew,                       /* Index of new first cell on page */
  int nNew,                       /* Final number of cells on page */
  CellArray *pCArray              /* Array of cells and sizes */
){
  u8 * const aData = pPg->aData;
  const int hdr = pPg->hdrOffset;
  u8 *pBegin = &pPg->aCellIdx[nNew * 2];  /* End of final cell-pointer array */
  int nCell = pPg->nCell;       /* Cells stored on pPg */
  u8 *pData;                    /* Start of the cell content area */
  u8 *pCellptr;
  int i;
  int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
  int iNewEnd = iNew + nNew;

#ifdef SQLITE_DEBUG
  /* Keep a copy of the original page for the verification loop below */
  u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
  memcpy(pTmp, aData, pPg->pBt->usableSize);
#endif

  /* Remove cells from the start and end of the page */
  assert( nCell>=0 );
  if( iOld<iNew ){
    int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
    if( NEVER(nShift>nCell) ) return SQLITE_CORRUPT_BKPT;
    memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
    nCell -= nShift;
  }
  if( iNewEnd < iOldEnd ){
    int nTail = pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
    assert( nCell>=nTail );
    nCell -= nTail;
  }

  /* If the content area already overlaps the space needed for the final
  ** cell-pointer array, in-place editing is not possible */
  pData = &aData[get2byte(&aData[hdr+5])];
  if( pData<pBegin ) goto editpage_fail;
  if( NEVER(pData>pPg->aDataEnd) ) goto editpage_fail;

  /* Add cells to the start of the page */
  if( iNew<iOld ){
    int nAdd = MIN(nNew,iOld-iNew);
    assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
    assert( nAdd>=0 );
    pCellptr = pPg->aCellIdx;
    memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
    if( pageInsertArray(
          pPg, pBegin, &pData, pCellptr,
          iNew, nAdd, pCArray
    ) ) goto editpage_fail;
    nCell += nAdd;
  }

  /* Add any overflow cells */
  for(i=0; i<pPg->nOverflow; i++){
    int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
    if( iCell>=0 && iCell<nNew ){
      pCellptr = &pPg->aCellIdx[iCell * 2];
      if( nCell>iCell ){
        memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
      }
      nCell++;
      /* Called for its side effect of filling in szCell[], which
      ** pageInsertArray() requires to be non-zero */
      cachedCellSize(pCArray, iCell+iNew);
      if( pageInsertArray(
            pPg, pBegin, &pData, pCellptr,
            iCell+iNew, 1, pCArray
      ) ) goto editpage_fail;
    }
  }

  /* Append cells to the end of the page */
  assert( nCell>=0 );
  pCellptr = &pPg->aCellIdx[nCell*2];
  if( pageInsertArray(
        pPg, pBegin, &pData, pCellptr,
        iNew+nCell, nNew-nCell, pCArray
  ) ) goto editpage_fail;

  pPg->nCell = nNew;
  pPg->nOverflow = 0;

  put2byte(&aData[hdr+3], pPg->nCell);
  put2byte(&aData[hdr+5], pData - aData);

#ifdef SQLITE_DEBUG
  /* Verify that every cell now on the page matches the cell array */
  for(i=0; i<nNew && !CORRUPT_DB; i++){
    u8 *pCell = pCArray->apCell[i+iNew];
    int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
    if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){
      pCell = &pTmp[pCell - aData];
    }
    assert( 0==memcmp(pCell, &aData[iOff],
            pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
  }
#endif

  return SQLITE_OK;
 editpage_fail:
  /* Unable to edit this page. Rebuild it from scratch instead. */
  if( nNew<1 ) return SQLITE_CORRUPT_BKPT;
  populateCellCache(pCArray, iNew, nNew);
  return rebuildPage(pCArray, iNew, nNew, pPg);
}


#ifndef SQLITE_OMIT_QUICKBALANCE
/*
** This version of balance() handles the common special case where
** a new entry is being inserted on the extreme right-end of the
** tree, in other words, when the new entry will become the largest
** entry in the tree.
007819 ** 007820 ** Instead of trying to balance the 3 right-most leaf pages, just add 007821 ** a new page to the right-hand side and put the one new entry in 007822 ** that page. This leaves the right side of the tree somewhat 007823 ** unbalanced. But odds are that we will be inserting new entries 007824 ** at the end soon afterwards so the nearly empty page will quickly 007825 ** fill up. On average. 007826 ** 007827 ** pPage is the leaf page which is the right-most page in the tree. 007828 ** pParent is its parent. pPage must have a single overflow entry 007829 ** which is also the right-most entry on the page. 007830 ** 007831 ** The pSpace buffer is used to store a temporary copy of the divider 007832 ** cell that will be inserted into pParent. Such a cell consists of a 4 007833 ** byte page number followed by a variable length integer. In other 007834 ** words, at most 13 bytes. Hence the pSpace buffer must be at 007835 ** least 13 bytes in size. 007836 */ 007837 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ 007838 BtShared *const pBt = pPage->pBt; /* B-Tree Database */ 007839 MemPage *pNew; /* Newly allocated page */ 007840 int rc; /* Return Code */ 007841 Pgno pgnoNew; /* Page number of pNew */ 007842 007843 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007844 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 007845 assert( pPage->nOverflow==1 ); 007846 007847 if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT; /* dbfuzz001.test */ 007848 assert( pPage->nFree>=0 ); 007849 assert( pParent->nFree>=0 ); 007850 007851 /* Allocate a new page. This page will become the right-sibling of 007852 ** pPage. Make the parent page writable, so that the new divider cell 007853 ** may be inserted. If both these operations are successful, proceed. 
007854 */ 007855 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 007856 007857 if( rc==SQLITE_OK ){ 007858 007859 u8 *pOut = &pSpace[4]; 007860 u8 *pCell = pPage->apOvfl[0]; 007861 u16 szCell = pPage->xCellSize(pPage, pCell); 007862 u8 *pStop; 007863 CellArray b; 007864 007865 assert( sqlite3PagerIswriteable(pNew->pDbPage) ); 007866 assert( CORRUPT_DB || pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); 007867 zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); 007868 b.nCell = 1; 007869 b.pRef = pPage; 007870 b.apCell = &pCell; 007871 b.szCell = &szCell; 007872 b.apEnd[0] = pPage->aDataEnd; 007873 b.ixNx[0] = 2; 007874 rc = rebuildPage(&b, 0, 1, pNew); 007875 if( NEVER(rc) ){ 007876 releasePage(pNew); 007877 return rc; 007878 } 007879 pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell; 007880 007881 /* If this is an auto-vacuum database, update the pointer map 007882 ** with entries for the new page, and any pointer from the 007883 ** cell on the page to an overflow page. If either of these 007884 ** operations fails, the return code is set, but the contents 007885 ** of the parent page are still manipulated by the code below. 007886 ** That is Ok, at this point the parent page is guaranteed to 007887 ** be marked as dirty. Returning an error code will cause a 007888 ** rollback, undoing any changes made to the parent page. 007889 */ 007890 if( ISAUTOVACUUM(pBt) ){ 007891 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc); 007892 if( szCell>pNew->minLocal ){ 007893 ptrmapPutOvflPtr(pNew, pNew, pCell, &rc); 007894 } 007895 } 007896 007897 /* Create a divider cell to insert into pParent. The divider cell 007898 ** consists of a 4-byte page number (the page number of pPage) and 007899 ** a variable length key value (which must be the same value as the 007900 ** largest key on pPage). 007901 ** 007902 ** To find the largest key value on pPage, first find the right-most 007903 ** cell on pPage. 
The first two fields of this cell are the 007904 ** record-length (a variable length integer at most 32-bits in size) 007905 ** and the key value (a variable length integer, may have any value). 007906 ** The first of the while(...) loops below skips over the record-length 007907 ** field. The second while(...) loop copies the key value from the 007908 ** cell on pPage into the pSpace buffer. 007909 */ 007910 pCell = findCell(pPage, pPage->nCell-1); 007911 pStop = &pCell[9]; 007912 while( (*(pCell++)&0x80) && pCell<pStop ); 007913 pStop = &pCell[9]; 007914 while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop ); 007915 007916 /* Insert the new divider cell into pParent. */ 007917 if( rc==SQLITE_OK ){ 007918 rc = insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace), 007919 0, pPage->pgno); 007920 } 007921 007922 /* Set the right-child pointer of pParent to point to the new page. */ 007923 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); 007924 007925 /* Release the reference to the new page. */ 007926 releasePage(pNew); 007927 } 007928 007929 return rc; 007930 } 007931 #endif /* SQLITE_OMIT_QUICKBALANCE */ 007932 007933 #if 0 007934 /* 007935 ** This function does not contribute anything to the operation of SQLite. 007936 ** it is sometimes activated temporarily while debugging code responsible 007937 ** for setting pointer-map entries. 
007938 */ 007939 static int ptrmapCheckPages(MemPage **apPage, int nPage){ 007940 int i, j; 007941 for(i=0; i<nPage; i++){ 007942 Pgno n; 007943 u8 e; 007944 MemPage *pPage = apPage[i]; 007945 BtShared *pBt = pPage->pBt; 007946 assert( pPage->isInit ); 007947 007948 for(j=0; j<pPage->nCell; j++){ 007949 CellInfo info; 007950 u8 *z; 007951 007952 z = findCell(pPage, j); 007953 pPage->xParseCell(pPage, z, &info); 007954 if( info.nLocal<info.nPayload ){ 007955 Pgno ovfl = get4byte(&z[info.nSize-4]); 007956 ptrmapGet(pBt, ovfl, &e, &n); 007957 assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 ); 007958 } 007959 if( !pPage->leaf ){ 007960 Pgno child = get4byte(z); 007961 ptrmapGet(pBt, child, &e, &n); 007962 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 007963 } 007964 } 007965 if( !pPage->leaf ){ 007966 Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]); 007967 ptrmapGet(pBt, child, &e, &n); 007968 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 007969 } 007970 } 007971 return 1; 007972 } 007973 #endif 007974 007975 /* 007976 ** This function is used to copy the contents of the b-tree node stored 007977 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then 007978 ** the pointer-map entries for each child page are updated so that the 007979 ** parent page stored in the pointer map is page pTo. If pFrom contained 007980 ** any cells with overflow page pointers, then the corresponding pointer 007981 ** map entries are also updated so that the parent page is page pTo. 007982 ** 007983 ** If pFrom is currently carrying any overflow cells (entries in the 007984 ** MemPage.apOvfl[] array), they are not copied to pTo. 007985 ** 007986 ** Before returning, page pTo is reinitialized using btreeInitPage(). 007987 ** 007988 ** The performance of this function is not critical. It is only used by 007989 ** the balance_shallower() and balance_deeper() procedures, neither of 007990 ** which are called often under normal circumstances. 
007991 */ 007992 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ 007993 if( (*pRC)==SQLITE_OK ){ 007994 BtShared * const pBt = pFrom->pBt; 007995 u8 * const aFrom = pFrom->aData; 007996 u8 * const aTo = pTo->aData; 007997 int const iFromHdr = pFrom->hdrOffset; 007998 int const iToHdr = ((pTo->pgno==1) ? 100 : 0); 007999 int rc; 008000 int iData; 008001 008002 008003 assert( pFrom->isInit ); 008004 assert( pFrom->nFree>=iToHdr ); 008005 assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize ); 008006 008007 /* Copy the b-tree node content from page pFrom to page pTo. */ 008008 iData = get2byte(&aFrom[iFromHdr+5]); 008009 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData); 008010 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell); 008011 008012 /* Reinitialize page pTo so that the contents of the MemPage structure 008013 ** match the new data. The initialization of pTo can actually fail under 008014 ** fairly obscure circumstances, even though it is a copy of initialized 008015 ** page pFrom. 008016 */ 008017 pTo->isInit = 0; 008018 rc = btreeInitPage(pTo); 008019 if( rc==SQLITE_OK ) rc = btreeComputeFreeSpace(pTo); 008020 if( rc!=SQLITE_OK ){ 008021 *pRC = rc; 008022 return; 008023 } 008024 008025 /* If this is an auto-vacuum database, update the pointer-map entries 008026 ** for any b-tree or overflow pages that pTo now contains the pointers to. 008027 */ 008028 if( ISAUTOVACUUM(pBt) ){ 008029 *pRC = setChildPtrmaps(pTo); 008030 } 008031 } 008032 } 008033 008034 /* 008035 ** This routine redistributes cells on the iParentIdx'th child of pParent 008036 ** (hereafter "the page") and up to 2 siblings so that all pages have about the 008037 ** same amount of free space. Usually a single sibling on either side of the 008038 ** page are used in the balancing, though both siblings might come from one 008039 ** side if the page is the first or last child of its parent. 
If the page 008040 ** has fewer than 2 siblings (something which can only happen if the page 008041 ** is a root page or a child of a root page) then all available siblings 008042 ** participate in the balancing. 008043 ** 008044 ** The number of siblings of the page might be increased or decreased by 008045 ** one or two in an effort to keep pages nearly full but not over full. 008046 ** 008047 ** Note that when this routine is called, some of the cells on the page 008048 ** might not actually be stored in MemPage.aData[]. This can happen 008049 ** if the page is overfull. This routine ensures that all cells allocated 008050 ** to the page and its siblings fit into MemPage.aData[] before returning. 008051 ** 008052 ** In the course of balancing the page and its siblings, cells may be 008053 ** inserted into or removed from the parent page (pParent). Doing so 008054 ** may cause the parent page to become overfull or underfull. If this 008055 ** happens, it is the responsibility of the caller to invoke the correct 008056 ** balancing routine to fix this problem (see the balance() routine). 008057 ** 008058 ** If this routine fails for any reason, it might leave the database 008059 ** in a corrupted state. So if this routine fails, the database should 008060 ** be rolled back. 008061 ** 008062 ** The third argument to this function, aOvflSpace, is a pointer to a 008063 ** buffer big enough to hold one page. If while inserting cells into the parent 008064 ** page (pParent) the parent page becomes overfull, this buffer is 008065 ** used to store the parent's overflow cells. Because this function inserts 008066 ** a maximum of four divider cells into the parent page, and the maximum 008067 ** size of a cell stored within an internal node is always less than 1/4 008068 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large 008069 ** enough for all overflow cells. 
008070 ** 008071 ** If aOvflSpace is set to a null pointer, this function returns 008072 ** SQLITE_NOMEM. 008073 */ 008074 static int balance_nonroot( 008075 MemPage *pParent, /* Parent page of siblings being balanced */ 008076 int iParentIdx, /* Index of "the page" in pParent */ 008077 u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */ 008078 int isRoot, /* True if pParent is a root-page */ 008079 int bBulk /* True if this call is part of a bulk load */ 008080 ){ 008081 BtShared *pBt; /* The whole database */ 008082 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ 008083 int nNew = 0; /* Number of pages in apNew[] */ 008084 int nOld; /* Number of pages in apOld[] */ 008085 int i, j, k; /* Loop counters */ 008086 int nxDiv; /* Next divider slot in pParent->aCell[] */ 008087 int rc = SQLITE_OK; /* The return code */ 008088 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */ 008089 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ 008090 int usableSpace; /* Bytes in pPage beyond the header */ 008091 int pageFlags; /* Value of pPage->aData[0] */ 008092 int iSpace1 = 0; /* First unused byte of aSpace1[] */ 008093 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ 008094 int szScratch; /* Size of scratch memory requested */ 008095 MemPage *apOld[NB]; /* pPage and up to two siblings */ 008096 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ 008097 u8 *pRight; /* Location in parent of right-sibling pointer */ 008098 u8 *apDiv[NB-1]; /* Divider cells in pParent */ 008099 int cntNew[NB+2]; /* Index in b.paCell[] of cell after i-th page */ 008100 int cntOld[NB+2]; /* Old index in b.apCell[] */ 008101 int szNew[NB+2]; /* Combined size of cells placed on i-th page */ 008102 u8 *aSpace1; /* Space for copies of dividers cells */ 008103 Pgno pgno; /* Temp var to store a page number in */ 008104 u8 abDone[NB+2]; /* True after i'th new page is populated */ 008105 Pgno aPgno[NB+2]; /* Page numbers of new 
pages before shuffling */ 008106 CellArray b; /* Parsed information on cells being balanced */ 008107 008108 memset(abDone, 0, sizeof(abDone)); 008109 memset(&b, 0, sizeof(b)); 008110 pBt = pParent->pBt; 008111 assert( sqlite3_mutex_held(pBt->mutex) ); 008112 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 008113 008114 /* At this point pParent may have at most one overflow cell. And if 008115 ** this overflow cell is present, it must be the cell with 008116 ** index iParentIdx. This scenario comes about when this function 008117 ** is called (indirectly) from sqlite3BtreeDelete(). 008118 */ 008119 assert( pParent->nOverflow==0 || pParent->nOverflow==1 ); 008120 assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx ); 008121 008122 if( !aOvflSpace ){ 008123 return SQLITE_NOMEM_BKPT; 008124 } 008125 assert( pParent->nFree>=0 ); 008126 008127 /* Find the sibling pages to balance. Also locate the cells in pParent 008128 ** that divide the siblings. An attempt is made to find NN siblings on 008129 ** either side of pPage. More siblings are taken from one side, however, 008130 ** if there are fewer than NN siblings on the other side. If pParent 008131 ** has NB or fewer children then all children of pParent are taken. 008132 ** 008133 ** This loop also drops the divider cells from the parent page. This 008134 ** way, the remainder of the function does not have to deal with any 008135 ** overflow cells in the parent page, since if any existed they will 008136 ** have already been removed. 
008137 */ 008138 i = pParent->nOverflow + pParent->nCell; 008139 if( i<2 ){ 008140 nxDiv = 0; 008141 }else{ 008142 assert( bBulk==0 || bBulk==1 ); 008143 if( iParentIdx==0 ){ 008144 nxDiv = 0; 008145 }else if( iParentIdx==i ){ 008146 nxDiv = i-2+bBulk; 008147 }else{ 008148 nxDiv = iParentIdx-1; 008149 } 008150 i = 2-bBulk; 008151 } 008152 nOld = i+1; 008153 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){ 008154 pRight = &pParent->aData[pParent->hdrOffset+8]; 008155 }else{ 008156 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow); 008157 } 008158 pgno = get4byte(pRight); 008159 while( 1 ){ 008160 if( rc==SQLITE_OK ){ 008161 rc = getAndInitPage(pBt, pgno, &apOld[i], 0); 008162 } 008163 if( rc ){ 008164 memset(apOld, 0, (i+1)*sizeof(MemPage*)); 008165 goto balance_cleanup; 008166 } 008167 if( apOld[i]->nFree<0 ){ 008168 rc = btreeComputeFreeSpace(apOld[i]); 008169 if( rc ){ 008170 memset(apOld, 0, (i)*sizeof(MemPage*)); 008171 goto balance_cleanup; 008172 } 008173 } 008174 nMaxCells += apOld[i]->nCell + ArraySize(pParent->apOvfl); 008175 if( (i--)==0 ) break; 008176 008177 if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){ 008178 apDiv[i] = pParent->apOvfl[0]; 008179 pgno = get4byte(apDiv[i]); 008180 szNew[i] = pParent->xCellSize(pParent, apDiv[i]); 008181 pParent->nOverflow = 0; 008182 }else{ 008183 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow); 008184 pgno = get4byte(apDiv[i]); 008185 szNew[i] = pParent->xCellSize(pParent, apDiv[i]); 008186 008187 /* Drop the cell from the parent page. apDiv[i] still points to 008188 ** the cell within the parent, even though it has been dropped. 008189 ** This is safe because dropping a cell only overwrites the first 008190 ** four bytes of it, and this function does not need the first 008191 ** four bytes of the divider cell. So the pointer is safe to use 008192 ** later on. 008193 ** 008194 ** But not if we are in secure-delete mode. 
In secure-delete mode, 008195 ** the dropCell() routine will overwrite the entire cell with zeroes. 008196 ** In this case, temporarily copy the cell into the aOvflSpace[] 008197 ** buffer. It will be copied out again as soon as the aSpace[] buffer 008198 ** is allocated. */ 008199 if( pBt->btsFlags & BTS_FAST_SECURE ){ 008200 int iOff; 008201 008202 /* If the following if() condition is not true, the db is corrupted. 008203 ** The call to dropCell() below will detect this. */ 008204 iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData); 008205 if( (iOff+szNew[i])<=(int)pBt->usableSize ){ 008206 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]); 008207 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData]; 008208 } 008209 } 008210 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc); 008211 } 008212 } 008213 008214 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte 008215 ** alignment */ 008216 nMaxCells = (nMaxCells + 3)&~3; 008217 008218 /* 008219 ** Allocate space for memory structures 008220 */ 008221 szScratch = 008222 nMaxCells*sizeof(u8*) /* b.apCell */ 008223 + nMaxCells*sizeof(u16) /* b.szCell */ 008224 + pBt->pageSize; /* aSpace1 */ 008225 008226 assert( szScratch<=7*(int)pBt->pageSize ); 008227 b.apCell = sqlite3StackAllocRaw(0, szScratch ); 008228 if( b.apCell==0 ){ 008229 rc = SQLITE_NOMEM_BKPT; 008230 goto balance_cleanup; 008231 } 008232 b.szCell = (u16*)&b.apCell[nMaxCells]; 008233 aSpace1 = (u8*)&b.szCell[nMaxCells]; 008234 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); 008235 008236 /* 008237 ** Load pointers to all cells on sibling pages and the divider cells 008238 ** into the local b.apCell[] array. Make copies of the divider cells 008239 ** into space obtained from aSpace1[]. The divider cells have already 008240 ** been removed from pParent. 
008241 ** 008242 ** If the siblings are on leaf pages, then the child pointers of the 008243 ** divider cells are stripped from the cells before they are copied 008244 ** into aSpace1[]. In this way, all cells in b.apCell[] are without 008245 ** child pointers. If siblings are not leaves, then all cell in 008246 ** b.apCell[] include child pointers. Either way, all cells in b.apCell[] 008247 ** are alike. 008248 ** 008249 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. 008250 ** leafData: 1 if pPage holds key+data and pParent holds only keys. 008251 */ 008252 b.pRef = apOld[0]; 008253 leafCorrection = b.pRef->leaf*4; 008254 leafData = b.pRef->intKeyLeaf; 008255 for(i=0; i<nOld; i++){ 008256 MemPage *pOld = apOld[i]; 008257 int limit = pOld->nCell; 008258 u8 *aData = pOld->aData; 008259 u16 maskPage = pOld->maskPage; 008260 u8 *piCell = aData + pOld->cellOffset; 008261 u8 *piEnd; 008262 VVA_ONLY( int nCellAtStart = b.nCell; ) 008263 008264 /* Verify that all sibling pages are of the same "type" (table-leaf, 008265 ** table-interior, index-leaf, or index-interior). 008266 */ 008267 if( pOld->aData[0]!=apOld[0]->aData[0] ){ 008268 rc = SQLITE_CORRUPT_BKPT; 008269 goto balance_cleanup; 008270 } 008271 008272 /* Load b.apCell[] with pointers to all cells in pOld. If pOld 008273 ** contains overflow cells, include them in the b.apCell[] array 008274 ** in the correct spot. 008275 ** 008276 ** Note that when there are multiple overflow cells, it is always the 008277 ** case that they are sequential and adjacent. This invariant arises 008278 ** because multiple overflows can only occurs when inserting divider 008279 ** cells into a parent on a prior balance, and divider cells are always 008280 ** adjacent and are inserted in order. There is an assert() tagged 008281 ** with "NOTE 1" in the overflow cell insertion loop to prove this 008282 ** invariant. 008283 ** 008284 ** This must be done in advance. 
Once the balance starts, the cell 008285 ** offset section of the btree page will be overwritten and we will no 008286 ** long be able to find the cells if a pointer to each cell is not saved 008287 ** first. 008288 */ 008289 memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow)); 008290 if( pOld->nOverflow>0 ){ 008291 if( NEVER(limit<pOld->aiOvfl[0]) ){ 008292 rc = SQLITE_CORRUPT_BKPT; 008293 goto balance_cleanup; 008294 } 008295 limit = pOld->aiOvfl[0]; 008296 for(j=0; j<limit; j++){ 008297 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell)); 008298 piCell += 2; 008299 b.nCell++; 008300 } 008301 for(k=0; k<pOld->nOverflow; k++){ 008302 assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */ 008303 b.apCell[b.nCell] = pOld->apOvfl[k]; 008304 b.nCell++; 008305 } 008306 } 008307 piEnd = aData + pOld->cellOffset + 2*pOld->nCell; 008308 while( piCell<piEnd ){ 008309 assert( b.nCell<nMaxCells ); 008310 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell)); 008311 piCell += 2; 008312 b.nCell++; 008313 } 008314 assert( (b.nCell-nCellAtStart)==(pOld->nCell+pOld->nOverflow) ); 008315 008316 cntOld[i] = b.nCell; 008317 if( i<nOld-1 && !leafData){ 008318 u16 sz = (u16)szNew[i]; 008319 u8 *pTemp; 008320 assert( b.nCell<nMaxCells ); 008321 b.szCell[b.nCell] = sz; 008322 pTemp = &aSpace1[iSpace1]; 008323 iSpace1 += sz; 008324 assert( sz<=pBt->maxLocal+23 ); 008325 assert( iSpace1 <= (int)pBt->pageSize ); 008326 memcpy(pTemp, apDiv[i], sz); 008327 b.apCell[b.nCell] = pTemp+leafCorrection; 008328 assert( leafCorrection==0 || leafCorrection==4 ); 008329 b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection; 008330 if( !pOld->leaf ){ 008331 assert( leafCorrection==0 ); 008332 assert( pOld->hdrOffset==0 || CORRUPT_DB ); 008333 /* The right pointer of the child page pOld becomes the left 008334 ** pointer of the divider cell */ 008335 memcpy(b.apCell[b.nCell], &pOld->aData[8], 4); 008336 }else{ 008337 assert( leafCorrection==4 
); 008338 while( b.szCell[b.nCell]<4 ){ 008339 /* Do not allow any cells smaller than 4 bytes. If a smaller cell 008340 ** does exist, pad it with 0x00 bytes. */ 008341 assert( b.szCell[b.nCell]==3 || CORRUPT_DB ); 008342 assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB ); 008343 aSpace1[iSpace1++] = 0x00; 008344 b.szCell[b.nCell]++; 008345 } 008346 } 008347 b.nCell++; 008348 } 008349 } 008350 008351 /* 008352 ** Figure out the number of pages needed to hold all b.nCell cells. 008353 ** Store this number in "k". Also compute szNew[] which is the total 008354 ** size of all cells on the i-th page and cntNew[] which is the index 008355 ** in b.apCell[] of the cell that divides page i from page i+1. 008356 ** cntNew[k] should equal b.nCell. 008357 ** 008358 ** Values computed by this block: 008359 ** 008360 ** k: The total number of sibling pages 008361 ** szNew[i]: Spaced used on the i-th sibling page. 008362 ** cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to 008363 ** the right of the i-th sibling page. 008364 ** usableSpace: Number of bytes of space available on each sibling. 
008365 ** 008366 */ 008367 usableSpace = pBt->usableSize - 12 + leafCorrection; 008368 for(i=k=0; i<nOld; i++, k++){ 008369 MemPage *p = apOld[i]; 008370 b.apEnd[k] = p->aDataEnd; 008371 b.ixNx[k] = cntOld[i]; 008372 if( k && b.ixNx[k]==b.ixNx[k-1] ){ 008373 k--; /* Omit b.ixNx[] entry for child pages with no cells */ 008374 } 008375 if( !leafData ){ 008376 k++; 008377 b.apEnd[k] = pParent->aDataEnd; 008378 b.ixNx[k] = cntOld[i]+1; 008379 } 008380 assert( p->nFree>=0 ); 008381 szNew[i] = usableSpace - p->nFree; 008382 for(j=0; j<p->nOverflow; j++){ 008383 szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]); 008384 } 008385 cntNew[i] = cntOld[i]; 008386 } 008387 k = nOld; 008388 for(i=0; i<k; i++){ 008389 int sz; 008390 while( szNew[i]>usableSpace ){ 008391 if( i+1>=k ){ 008392 k = i+2; 008393 if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } 008394 szNew[k-1] = 0; 008395 cntNew[k-1] = b.nCell; 008396 } 008397 sz = 2 + cachedCellSize(&b, cntNew[i]-1); 008398 szNew[i] -= sz; 008399 if( !leafData ){ 008400 if( cntNew[i]<b.nCell ){ 008401 sz = 2 + cachedCellSize(&b, cntNew[i]); 008402 }else{ 008403 sz = 0; 008404 } 008405 } 008406 szNew[i+1] += sz; 008407 cntNew[i]--; 008408 } 008409 while( cntNew[i]<b.nCell ){ 008410 sz = 2 + cachedCellSize(&b, cntNew[i]); 008411 if( szNew[i]+sz>usableSpace ) break; 008412 szNew[i] += sz; 008413 cntNew[i]++; 008414 if( !leafData ){ 008415 if( cntNew[i]<b.nCell ){ 008416 sz = 2 + cachedCellSize(&b, cntNew[i]); 008417 }else{ 008418 sz = 0; 008419 } 008420 } 008421 szNew[i+1] -= sz; 008422 } 008423 if( cntNew[i]>=b.nCell ){ 008424 k = i+1; 008425 }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){ 008426 rc = SQLITE_CORRUPT_BKPT; 008427 goto balance_cleanup; 008428 } 008429 } 008430 008431 /* 008432 ** The packing computed by the previous block is biased toward the siblings 008433 ** on the left side (siblings with smaller keys). The left siblings are 008434 ** always nearly full, while the right-most sibling might be nearly empty. 
008435 ** The next block of code attempts to adjust the packing of siblings to 008436 ** get a better balance. 008437 ** 008438 ** This adjustment is more than an optimization. The packing above might 008439 ** be so out of balance as to be illegal. For example, the right-most 008440 ** sibling might be completely empty. This adjustment is not optional. 008441 */ 008442 for(i=k-1; i>0; i--){ 008443 int szRight = szNew[i]; /* Size of sibling on the right */ 008444 int szLeft = szNew[i-1]; /* Size of sibling on the left */ 008445 int r; /* Index of right-most cell in left sibling */ 008446 int d; /* Index of first cell to the left of right sibling */ 008447 008448 r = cntNew[i-1] - 1; 008449 d = r + 1 - leafData; 008450 (void)cachedCellSize(&b, d); 008451 do{ 008452 int szR, szD; 008453 assert( d<nMaxCells ); 008454 assert( r<nMaxCells ); 008455 szR = cachedCellSize(&b, r); 008456 szD = b.szCell[d]; 008457 if( szRight!=0 008458 && (bBulk || szRight+szD+2 > szLeft-(szR+(i==k-1?0:2)))){ 008459 break; 008460 } 008461 szRight += szD + 2; 008462 szLeft -= szR + 2; 008463 cntNew[i-1] = r; 008464 r--; 008465 d--; 008466 }while( r>=0 ); 008467 szNew[i] = szRight; 008468 szNew[i-1] = szLeft; 008469 if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){ 008470 rc = SQLITE_CORRUPT_BKPT; 008471 goto balance_cleanup; 008472 } 008473 } 008474 008475 /* Sanity check: For a non-corrupt database file one of the following 008476 ** must be true: 008477 ** (1) We found one or more cells (cntNew[0])>0), or 008478 ** (2) pPage is a virtual root page. A virtual root page is when 008479 ** the real root page is page 1 and we are the only child of 008480 ** that page. 008481 */ 008482 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB); 008483 TRACE(("BALANCE: old: %u(nc=%u) %u(nc=%u) %u(nc=%u)\n", 008484 apOld[0]->pgno, apOld[0]->nCell, 008485 nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0, 008486 nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? 
apOld[2]->nCell : 0 008487 )); 008488 008489 /* 008490 ** Allocate k new pages. Reuse old pages where possible. 008491 */ 008492 pageFlags = apOld[0]->aData[0]; 008493 for(i=0; i<k; i++){ 008494 MemPage *pNew; 008495 if( i<nOld ){ 008496 pNew = apNew[i] = apOld[i]; 008497 apOld[i] = 0; 008498 rc = sqlite3PagerWrite(pNew->pDbPage); 008499 nNew++; 008500 if( sqlite3PagerPageRefcount(pNew->pDbPage)!=1+(i==(iParentIdx-nxDiv)) 008501 && rc==SQLITE_OK 008502 ){ 008503 rc = SQLITE_CORRUPT_BKPT; 008504 } 008505 if( rc ) goto balance_cleanup; 008506 }else{ 008507 assert( i>0 ); 008508 rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0); 008509 if( rc ) goto balance_cleanup; 008510 zeroPage(pNew, pageFlags); 008511 apNew[i] = pNew; 008512 nNew++; 008513 cntOld[i] = b.nCell; 008514 008515 /* Set the pointer-map entry for the new sibling page. */ 008516 if( ISAUTOVACUUM(pBt) ){ 008517 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc); 008518 if( rc!=SQLITE_OK ){ 008519 goto balance_cleanup; 008520 } 008521 } 008522 } 008523 } 008524 008525 /* 008526 ** Reassign page numbers so that the new pages are in ascending order. 008527 ** This helps to keep entries in the disk file in order so that a scan 008528 ** of the table is closer to a linear scan through the file. That in turn 008529 ** helps the operating system to deliver pages from the disk more rapidly. 008530 ** 008531 ** An O(N*N) sort algorithm is used, but since N is never more than NB+2 008532 ** (5), that is not a performance concern. 008533 ** 008534 ** When NB==3, this one optimization makes the database about 25% faster 008535 ** for large insertions and deletions. 
008536 */ 008537 for(i=0; i<nNew; i++){ 008538 aPgno[i] = apNew[i]->pgno; 008539 assert( apNew[i]->pDbPage->flags & PGHDR_WRITEABLE ); 008540 assert( apNew[i]->pDbPage->flags & PGHDR_DIRTY ); 008541 } 008542 for(i=0; i<nNew-1; i++){ 008543 int iB = i; 008544 for(j=i+1; j<nNew; j++){ 008545 if( apNew[j]->pgno < apNew[iB]->pgno ) iB = j; 008546 } 008547 008548 /* If apNew[i] has a page number that is bigger than any of the 008549 ** subsequence apNew[i] entries, then swap apNew[i] with the subsequent 008550 ** entry that has the smallest page number (which we know to be 008551 ** entry apNew[iB]). 008552 */ 008553 if( iB!=i ){ 008554 Pgno pgnoA = apNew[i]->pgno; 008555 Pgno pgnoB = apNew[iB]->pgno; 008556 Pgno pgnoTemp = (PENDING_BYTE/pBt->pageSize)+1; 008557 u16 fgA = apNew[i]->pDbPage->flags; 008558 u16 fgB = apNew[iB]->pDbPage->flags; 008559 sqlite3PagerRekey(apNew[i]->pDbPage, pgnoTemp, fgB); 008560 sqlite3PagerRekey(apNew[iB]->pDbPage, pgnoA, fgA); 008561 sqlite3PagerRekey(apNew[i]->pDbPage, pgnoB, fgB); 008562 apNew[i]->pgno = pgnoB; 008563 apNew[iB]->pgno = pgnoA; 008564 } 008565 } 008566 008567 TRACE(("BALANCE: new: %u(%u nc=%u) %u(%u nc=%u) %u(%u nc=%u) " 008568 "%u(%u nc=%u) %u(%u nc=%u)\n", 008569 apNew[0]->pgno, szNew[0], cntNew[0], 008570 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, 008571 nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0, 008572 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0, 008573 nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0, 008574 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0, 008575 nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0, 008576 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0, 008577 nNew>=5 ? 
cntNew[4] - cntNew[3] - !leafData : 0 008578 )); 008579 008580 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 008581 assert( nNew>=1 && nNew<=ArraySize(apNew) ); 008582 assert( apNew[nNew-1]!=0 ); 008583 put4byte(pRight, apNew[nNew-1]->pgno); 008584 008585 /* If the sibling pages are not leaves, ensure that the right-child pointer 008586 ** of the right-most new sibling page is set to the value that was 008587 ** originally in the same field of the right-most old sibling page. */ 008588 if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){ 008589 MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1]; 008590 memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4); 008591 } 008592 008593 /* Make any required updates to pointer map entries associated with 008594 ** cells stored on sibling pages following the balance operation. Pointer 008595 ** map entries associated with divider cells are set by the insertCell() 008596 ** routine. The associated pointer map entries are: 008597 ** 008598 ** a) if the cell contains a reference to an overflow chain, the 008599 ** entry associated with the first page in the overflow chain, and 008600 ** 008601 ** b) if the sibling pages are not leaves, the child page associated 008602 ** with the cell. 008603 ** 008604 ** If the sibling pages are not leaves, then the pointer map entry 008605 ** associated with the right-child of each sibling may also need to be 008606 ** updated. This happens below, after the sibling pages have been 008607 ** populated, not here. 008608 */ 008609 if( ISAUTOVACUUM(pBt) ){ 008610 MemPage *pOld; 008611 MemPage *pNew = pOld = apNew[0]; 008612 int cntOldNext = pNew->nCell + pNew->nOverflow; 008613 int iNew = 0; 008614 int iOld = 0; 008615 008616 for(i=0; i<b.nCell; i++){ 008617 u8 *pCell = b.apCell[i]; 008618 while( i==cntOldNext ){ 008619 iOld++; 008620 assert( iOld<nNew || iOld<nOld ); 008621 assert( iOld>=0 && iOld<NB ); 008622 pOld = iOld<nNew ? 
apNew[iOld] : apOld[iOld]; 008623 cntOldNext += pOld->nCell + pOld->nOverflow + !leafData; 008624 } 008625 if( i==cntNew[iNew] ){ 008626 pNew = apNew[++iNew]; 008627 if( !leafData ) continue; 008628 } 008629 008630 /* Cell pCell is destined for new sibling page pNew. Originally, it 008631 ** was either part of sibling page iOld (possibly an overflow cell), 008632 ** or else the divider cell to the left of sibling page iOld. So, 008633 ** if sibling page iOld had the same page number as pNew, and if 008634 ** pCell really was a part of sibling page iOld (not a divider or 008635 ** overflow cell), we can skip updating the pointer map entries. */ 008636 if( iOld>=nNew 008637 || pNew->pgno!=aPgno[iOld] 008638 || !SQLITE_WITHIN(pCell,pOld->aData,pOld->aDataEnd) 008639 ){ 008640 if( !leafCorrection ){ 008641 ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc); 008642 } 008643 if( cachedCellSize(&b,i)>pNew->minLocal ){ 008644 ptrmapPutOvflPtr(pNew, pOld, pCell, &rc); 008645 } 008646 if( rc ) goto balance_cleanup; 008647 } 008648 } 008649 } 008650 008651 /* Insert new divider cells into pParent. */ 008652 for(i=0; i<nNew-1; i++){ 008653 u8 *pCell; 008654 u8 *pTemp; 008655 int sz; 008656 u8 *pSrcEnd; 008657 MemPage *pNew = apNew[i]; 008658 j = cntNew[i]; 008659 008660 assert( j<nMaxCells ); 008661 assert( b.apCell[j]!=0 ); 008662 pCell = b.apCell[j]; 008663 sz = b.szCell[j] + leafCorrection; 008664 pTemp = &aOvflSpace[iOvflSpace]; 008665 if( !pNew->leaf ){ 008666 memcpy(&pNew->aData[8], pCell, 4); 008667 }else if( leafData ){ 008668 /* If the tree is a leaf-data tree, and the siblings are leaves, 008669 ** then there is no divider cell in b.apCell[]. Instead, the divider 008670 ** cell consists of the integer key for the right-most cell of 008671 ** the sibling-page assembled above only. 
008672 */ 008673 CellInfo info; 008674 j--; 008675 pNew->xParseCell(pNew, b.apCell[j], &info); 008676 pCell = pTemp; 008677 sz = 4 + putVarint(&pCell[4], info.nKey); 008678 pTemp = 0; 008679 }else{ 008680 pCell -= 4; 008681 /* Obscure case for non-leaf-data trees: If the cell at pCell was 008682 ** previously stored on a leaf node, and its reported size was 4 008683 ** bytes, then it may actually be smaller than this 008684 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of 008685 ** any cell). But it is important to pass the correct size to 008686 ** insertCell(), so reparse the cell now. 008687 ** 008688 ** This can only happen for b-trees used to evaluate "IN (SELECT ...)" 008689 ** and WITHOUT ROWID tables with exactly one column which is the 008690 ** primary key. 008691 */ 008692 if( b.szCell[j]==4 ){ 008693 assert(leafCorrection==4); 008694 sz = pParent->xCellSize(pParent, pCell); 008695 } 008696 } 008697 iOvflSpace += sz; 008698 assert( sz<=pBt->maxLocal+23 ); 008699 assert( iOvflSpace <= (int)pBt->pageSize ); 008700 for(k=0; ALWAYS(k<NB*2) && b.ixNx[k]<=j; k++){} 008701 pSrcEnd = b.apEnd[k]; 008702 if( SQLITE_OVERFLOW(pSrcEnd, pCell, pCell+sz) ){ 008703 rc = SQLITE_CORRUPT_BKPT; 008704 goto balance_cleanup; 008705 } 008706 rc = insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno); 008707 if( rc!=SQLITE_OK ) goto balance_cleanup; 008708 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 008709 } 008710 008711 /* Now update the actual sibling pages. The order in which they are updated 008712 ** is important, as this code needs to avoid disrupting any page from which 008713 ** cells may still to be read. In practice, this means: 008714 ** 008715 ** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1]) 008716 ** then it is not safe to update page apNew[iPg] until after 008717 ** the left-hand sibling apNew[iPg-1] has been updated. 
008718 ** 008719 ** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1]) 008720 ** then it is not safe to update page apNew[iPg] until after 008721 ** the right-hand sibling apNew[iPg+1] has been updated. 008722 ** 008723 ** If neither of the above apply, the page is safe to update. 008724 ** 008725 ** The iPg value in the following loop starts at nNew-1 goes down 008726 ** to 0, then back up to nNew-1 again, thus making two passes over 008727 ** the pages. On the initial downward pass, only condition (1) above 008728 ** needs to be tested because (2) will always be true from the previous 008729 ** step. On the upward pass, both conditions are always true, so the 008730 ** upwards pass simply processes pages that were missed on the downward 008731 ** pass. 008732 */ 008733 for(i=1-nNew; i<nNew; i++){ 008734 int iPg = i<0 ? -i : i; 008735 assert( iPg>=0 && iPg<nNew ); 008736 assert( iPg>=1 || i>=0 ); 008737 assert( iPg<ArraySize(cntOld) ); 008738 if( abDone[iPg] ) continue; /* Skip pages already processed */ 008739 if( i>=0 /* On the upwards pass, or... */ 008740 || cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */ 008741 ){ 008742 int iNew; 008743 int iOld; 008744 int nNewCell; 008745 008746 /* Verify condition (1): If cells are moving left, update iPg 008747 ** only after iPg-1 has already been updated. */ 008748 assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] ); 008749 008750 /* Verify condition (2): If cells are moving right, update iPg 008751 ** only after iPg+1 has already been updated. */ 008752 assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] ); 008753 008754 if( iPg==0 ){ 008755 iNew = iOld = 0; 008756 nNewCell = cntNew[0]; 008757 }else{ 008758 iOld = iPg<nOld ? 
(cntOld[iPg-1] + !leafData) : b.nCell; 008759 iNew = cntNew[iPg-1] + !leafData; 008760 nNewCell = cntNew[iPg] - iNew; 008761 } 008762 008763 rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b); 008764 if( rc ) goto balance_cleanup; 008765 abDone[iPg]++; 008766 apNew[iPg]->nFree = usableSpace-szNew[iPg]; 008767 assert( apNew[iPg]->nOverflow==0 ); 008768 assert( apNew[iPg]->nCell==nNewCell ); 008769 } 008770 } 008771 008772 /* All pages have been processed exactly once */ 008773 assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 ); 008774 008775 assert( nOld>0 ); 008776 assert( nNew>0 ); 008777 008778 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){ 008779 /* The root page of the b-tree now contains no cells. The only sibling 008780 ** page is the right-child of the parent. Copy the contents of the 008781 ** child page into the parent, decreasing the overall height of the 008782 ** b-tree structure by one. This is described as the "balance-shallower" 008783 ** sub-algorithm in some documentation. 008784 ** 008785 ** If this is an auto-vacuum database, the call to copyNodeContent() 008786 ** sets all pointer-map entries corresponding to database image pages 008787 ** for which the pointer is stored within the content being copied. 008788 ** 008789 ** It is critical that the child page be defragmented before being 008790 ** copied into the parent, because if the parent is page 1 then it will 008791 ** by smaller than the child due to the database header, and so all the 008792 ** free space needs to be up front. 
008793 */ 008794 assert( nNew==1 || CORRUPT_DB ); 008795 rc = defragmentPage(apNew[0], -1); 008796 testcase( rc!=SQLITE_OK ); 008797 assert( apNew[0]->nFree == 008798 (get2byteNotZero(&apNew[0]->aData[5]) - apNew[0]->cellOffset 008799 - apNew[0]->nCell*2) 008800 || rc!=SQLITE_OK 008801 ); 008802 copyNodeContent(apNew[0], pParent, &rc); 008803 freePage(apNew[0], &rc); 008804 }else if( ISAUTOVACUUM(pBt) && !leafCorrection ){ 008805 /* Fix the pointer map entries associated with the right-child of each 008806 ** sibling page. All other pointer map entries have already been taken 008807 ** care of. */ 008808 for(i=0; i<nNew; i++){ 008809 u32 key = get4byte(&apNew[i]->aData[8]); 008810 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); 008811 } 008812 } 008813 008814 assert( pParent->isInit ); 008815 TRACE(("BALANCE: finished: old=%u new=%u cells=%u\n", 008816 nOld, nNew, b.nCell)); 008817 008818 /* Free any old pages that were not reused as new pages. 008819 */ 008820 for(i=nNew; i<nOld; i++){ 008821 freePage(apOld[i], &rc); 008822 } 008823 008824 #if 0 008825 if( ISAUTOVACUUM(pBt) && rc==SQLITE_OK && apNew[0]->isInit ){ 008826 /* The ptrmapCheckPages() contains assert() statements that verify that 008827 ** all pointer map pages are set correctly. This is helpful while 008828 ** debugging. This is usually disabled because a corrupt database may 008829 ** cause an assert() statement to fail. */ 008830 ptrmapCheckPages(apNew, nNew); 008831 ptrmapCheckPages(&pParent, 1); 008832 } 008833 #endif 008834 008835 /* 008836 ** Cleanup before returning. 008837 */ 008838 balance_cleanup: 008839 sqlite3StackFree(0, b.apCell); 008840 for(i=0; i<nOld; i++){ 008841 releasePage(apOld[i]); 008842 } 008843 for(i=0; i<nNew; i++){ 008844 releasePage(apNew[i]); 008845 } 008846 008847 return rc; 008848 } 008849 008850 008851 /* 008852 ** This function is called when the root page of a b-tree structure is 008853 ** overfull (has one or more overflow pages). 
008854 ** 008855 ** A new child page is allocated and the contents of the current root 008856 ** page, including overflow cells, are copied into the child. The root 008857 ** page is then overwritten to make it an empty page with the right-child 008858 ** pointer pointing to the new page. 008859 ** 008860 ** Before returning, all pointer-map entries corresponding to pages 008861 ** that the new child-page now contains pointers to are updated. The 008862 ** entry corresponding to the new right-child pointer of the root 008863 ** page is also updated. 008864 ** 008865 ** If successful, *ppChild is set to contain a reference to the child 008866 ** page and SQLITE_OK is returned. In this case the caller is required 008867 ** to call releasePage() on *ppChild exactly once. If an error occurs, 008868 ** an error code is returned and *ppChild is set to 0. 008869 */ 008870 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ 008871 int rc; /* Return value from subprocedures */ 008872 MemPage *pChild = 0; /* Pointer to a new child page */ 008873 Pgno pgnoChild = 0; /* Page number of the new child page */ 008874 BtShared *pBt = pRoot->pBt; /* The BTree */ 008875 008876 assert( pRoot->nOverflow>0 ); 008877 assert( sqlite3_mutex_held(pBt->mutex) ); 008878 008879 /* Make pRoot, the root page of the b-tree, writable. Allocate a new 008880 ** page that will become the new right-child of pPage. Copy the contents 008881 ** of the node stored on pRoot into the new child page. 
008882 */ 008883 rc = sqlite3PagerWrite(pRoot->pDbPage); 008884 if( rc==SQLITE_OK ){ 008885 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0); 008886 copyNodeContent(pRoot, pChild, &rc); 008887 if( ISAUTOVACUUM(pBt) ){ 008888 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc); 008889 } 008890 } 008891 if( rc ){ 008892 *ppChild = 0; 008893 releasePage(pChild); 008894 return rc; 008895 } 008896 assert( sqlite3PagerIswriteable(pChild->pDbPage) ); 008897 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 008898 assert( pChild->nCell==pRoot->nCell || CORRUPT_DB ); 008899 008900 TRACE(("BALANCE: copy root %u into %u\n", pRoot->pgno, pChild->pgno)); 008901 008902 /* Copy the overflow cells from pRoot to pChild */ 008903 memcpy(pChild->aiOvfl, pRoot->aiOvfl, 008904 pRoot->nOverflow*sizeof(pRoot->aiOvfl[0])); 008905 memcpy(pChild->apOvfl, pRoot->apOvfl, 008906 pRoot->nOverflow*sizeof(pRoot->apOvfl[0])); 008907 pChild->nOverflow = pRoot->nOverflow; 008908 008909 /* Zero the contents of pRoot. Then install pChild as the right-child. */ 008910 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF); 008911 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild); 008912 008913 *ppChild = pChild; 008914 return SQLITE_OK; 008915 } 008916 008917 /* 008918 ** Return SQLITE_CORRUPT if any cursor other than pCur is currently valid 008919 ** on the same B-tree as pCur. 008920 ** 008921 ** This can occur if a database is corrupt with two or more SQL tables 008922 ** pointing to the same b-tree. If an insert occurs on one SQL table 008923 ** and causes a BEFORE TRIGGER to do a secondary insert on the other SQL 008924 ** table linked to the same b-tree. If the secondary insert causes a 008925 ** rebalance, that can change content out from under the cursor on the 008926 ** first SQL table, violating invariants on the first insert. 
008927 */ 008928 static int anotherValidCursor(BtCursor *pCur){ 008929 BtCursor *pOther; 008930 for(pOther=pCur->pBt->pCursor; pOther; pOther=pOther->pNext){ 008931 if( pOther!=pCur 008932 && pOther->eState==CURSOR_VALID 008933 && pOther->pPage==pCur->pPage 008934 ){ 008935 return SQLITE_CORRUPT_BKPT; 008936 } 008937 } 008938 return SQLITE_OK; 008939 } 008940 008941 /* 008942 ** The page that pCur currently points to has just been modified in 008943 ** some way. This function figures out if this modification means the 008944 ** tree needs to be balanced, and if so calls the appropriate balancing 008945 ** routine. Balancing routines are: 008946 ** 008947 ** balance_quick() 008948 ** balance_deeper() 008949 ** balance_nonroot() 008950 */ 008951 static int balance(BtCursor *pCur){ 008952 int rc = SQLITE_OK; 008953 u8 aBalanceQuickSpace[13]; 008954 u8 *pFree = 0; 008955 008956 VVA_ONLY( int balance_quick_called = 0 ); 008957 VVA_ONLY( int balance_deeper_called = 0 ); 008958 008959 do { 008960 int iPage; 008961 MemPage *pPage = pCur->pPage; 008962 008963 if( NEVER(pPage->nFree<0) && btreeComputeFreeSpace(pPage) ) break; 008964 if( pPage->nOverflow==0 && pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){ 008965 /* No rebalance required as long as: 008966 ** (1) There are no overflow cells 008967 ** (2) The amount of free space on the page is less than 2/3rds of 008968 ** the total usable space on the page. */ 008969 break; 008970 }else if( (iPage = pCur->iPage)==0 ){ 008971 if( pPage->nOverflow && (rc = anotherValidCursor(pCur))==SQLITE_OK ){ 008972 /* The root page of the b-tree is overfull. In this case call the 008973 ** balance_deeper() function to create a new child for the root-page 008974 ** and copy the current contents of the root-page to it. The 008975 ** next iteration of the do-loop will balance the child page. 
008976 */ 008977 assert( balance_deeper_called==0 ); 008978 VVA_ONLY( balance_deeper_called++ ); 008979 rc = balance_deeper(pPage, &pCur->apPage[1]); 008980 if( rc==SQLITE_OK ){ 008981 pCur->iPage = 1; 008982 pCur->ix = 0; 008983 pCur->aiIdx[0] = 0; 008984 pCur->apPage[0] = pPage; 008985 pCur->pPage = pCur->apPage[1]; 008986 assert( pCur->pPage->nOverflow ); 008987 } 008988 }else{ 008989 break; 008990 } 008991 }else if( sqlite3PagerPageRefcount(pPage->pDbPage)>1 ){ 008992 /* The page being written is not a root page, and there is currently 008993 ** more than one reference to it. This only happens if the page is one 008994 ** of its own ancestor pages. Corruption. */ 008995 rc = SQLITE_CORRUPT_BKPT; 008996 }else{ 008997 MemPage * const pParent = pCur->apPage[iPage-1]; 008998 int const iIdx = pCur->aiIdx[iPage-1]; 008999 009000 rc = sqlite3PagerWrite(pParent->pDbPage); 009001 if( rc==SQLITE_OK && pParent->nFree<0 ){ 009002 rc = btreeComputeFreeSpace(pParent); 009003 } 009004 if( rc==SQLITE_OK ){ 009005 #ifndef SQLITE_OMIT_QUICKBALANCE 009006 if( pPage->intKeyLeaf 009007 && pPage->nOverflow==1 009008 && pPage->aiOvfl[0]==pPage->nCell 009009 && pParent->pgno!=1 009010 && pParent->nCell==iIdx 009011 ){ 009012 /* Call balance_quick() to create a new sibling of pPage on which 009013 ** to store the overflow cell. balance_quick() inserts a new cell 009014 ** into pParent, which may cause pParent overflow. If this 009015 ** happens, the next iteration of the do-loop will balance pParent 009016 ** use either balance_nonroot() or balance_deeper(). Until this 009017 ** happens, the overflow cell is stored in the aBalanceQuickSpace[] 009018 ** buffer. 009019 ** 009020 ** The purpose of the following assert() is to check that only a 009021 ** single call to balance_quick() is made for each call to this 009022 ** function. If this were not verified, a subtle bug involving reuse 009023 ** of the aBalanceQuickSpace[] might sneak in. 
009024 */ 009025 assert( balance_quick_called==0 ); 009026 VVA_ONLY( balance_quick_called++ ); 009027 rc = balance_quick(pParent, pPage, aBalanceQuickSpace); 009028 }else 009029 #endif 009030 { 009031 /* In this case, call balance_nonroot() to redistribute cells 009032 ** between pPage and up to 2 of its sibling pages. This involves 009033 ** modifying the contents of pParent, which may cause pParent to 009034 ** become overfull or underfull. The next iteration of the do-loop 009035 ** will balance the parent page to correct this. 009036 ** 009037 ** If the parent page becomes overfull, the overflow cell or cells 009038 ** are stored in the pSpace buffer allocated immediately below. 009039 ** A subsequent iteration of the do-loop will deal with this by 009040 ** calling balance_nonroot() (balance_deeper() may be called first, 009041 ** but it doesn't deal with overflow cells - just moves them to a 009042 ** different page). Once this subsequent call to balance_nonroot() 009043 ** has completed, it is safe to release the pSpace buffer used by 009044 ** the previous call, as the overflow cell data will have been 009045 ** copied either into the body of a database page or into the new 009046 ** pSpace buffer passed to the latter call to balance_nonroot(). 009047 */ 009048 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize); 009049 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, 009050 pCur->hints&BTREE_BULKLOAD); 009051 if( pFree ){ 009052 /* If pFree is not NULL, it points to the pSpace buffer used 009053 ** by a previous call to balance_nonroot(). Its contents are 009054 ** now stored either on real database pages or within the 009055 ** new pSpace buffer, so it may be safely freed here. */ 009056 sqlite3PageFree(pFree); 009057 } 009058 009059 /* The pSpace buffer will be freed after the next call to 009060 ** balance_nonroot(), or just before this function returns, whichever 009061 ** comes first. 
*/ 009062 pFree = pSpace; 009063 } 009064 } 009065 009066 pPage->nOverflow = 0; 009067 009068 /* The next iteration of the do-loop balances the parent page. */ 009069 releasePage(pPage); 009070 pCur->iPage--; 009071 assert( pCur->iPage>=0 ); 009072 pCur->pPage = pCur->apPage[pCur->iPage]; 009073 } 009074 }while( rc==SQLITE_OK ); 009075 009076 if( pFree ){ 009077 sqlite3PageFree(pFree); 009078 } 009079 return rc; 009080 } 009081 009082 /* Overwrite content from pX into pDest. Only do the write if the 009083 ** content is different from what is already there. 009084 */ 009085 static int btreeOverwriteContent( 009086 MemPage *pPage, /* MemPage on which writing will occur */ 009087 u8 *pDest, /* Pointer to the place to start writing */ 009088 const BtreePayload *pX, /* Source of data to write */ 009089 int iOffset, /* Offset of first byte to write */ 009090 int iAmt /* Number of bytes to be written */ 009091 ){ 009092 int nData = pX->nData - iOffset; 009093 if( nData<=0 ){ 009094 /* Overwriting with zeros */ 009095 int i; 009096 for(i=0; i<iAmt && pDest[i]==0; i++){} 009097 if( i<iAmt ){ 009098 int rc = sqlite3PagerWrite(pPage->pDbPage); 009099 if( rc ) return rc; 009100 memset(pDest + i, 0, iAmt - i); 009101 } 009102 }else{ 009103 if( nData<iAmt ){ 009104 /* Mixed read data and zeros at the end. Make a recursive call 009105 ** to write the zeros then fall through to write the real data */ 009106 int rc = btreeOverwriteContent(pPage, pDest+nData, pX, iOffset+nData, 009107 iAmt-nData); 009108 if( rc ) return rc; 009109 iAmt = nData; 009110 } 009111 if( memcmp(pDest, ((u8*)pX->pData) + iOffset, iAmt)!=0 ){ 009112 int rc = sqlite3PagerWrite(pPage->pDbPage); 009113 if( rc ) return rc; 009114 /* In a corrupt database, it is possible for the source and destination 009115 ** buffers to overlap. This is harmless since the database is already 009116 ** corrupt but it does cause valgrind and ASAN warnings. So use 009117 ** memmove(). 
*/ 009118 memmove(pDest, ((u8*)pX->pData) + iOffset, iAmt); 009119 } 009120 } 009121 return SQLITE_OK; 009122 } 009123 009124 /* 009125 ** Overwrite the cell that cursor pCur is pointing to with fresh content 009126 ** contained in pX. In this variant, pCur is pointing to an overflow 009127 ** cell. 009128 */ 009129 static SQLITE_NOINLINE int btreeOverwriteOverflowCell( 009130 BtCursor *pCur, /* Cursor pointing to cell to overwrite */ 009131 const BtreePayload *pX /* Content to write into the cell */ 009132 ){ 009133 int iOffset; /* Next byte of pX->pData to write */ 009134 int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */ 009135 int rc; /* Return code */ 009136 MemPage *pPage = pCur->pPage; /* Page being written */ 009137 BtShared *pBt; /* Btree */ 009138 Pgno ovflPgno; /* Next overflow page to write */ 009139 u32 ovflPageSize; /* Size to write on overflow page */ 009140 009141 assert( pCur->info.nLocal<nTotal ); /* pCur is an overflow cell */ 009142 009143 /* Overwrite the local portion first */ 009144 rc = btreeOverwriteContent(pPage, pCur->info.pPayload, pX, 009145 0, pCur->info.nLocal); 009146 if( rc ) return rc; 009147 009148 /* Now overwrite the overflow pages */ 009149 iOffset = pCur->info.nLocal; 009150 assert( nTotal>=0 ); 009151 assert( iOffset>=0 ); 009152 ovflPgno = get4byte(pCur->info.pPayload + iOffset); 009153 pBt = pPage->pBt; 009154 ovflPageSize = pBt->usableSize - 4; 009155 do{ 009156 rc = btreeGetPage(pBt, ovflPgno, &pPage, 0); 009157 if( rc ) return rc; 009158 if( sqlite3PagerPageRefcount(pPage->pDbPage)!=1 || pPage->isInit ){ 009159 rc = SQLITE_CORRUPT_BKPT; 009160 }else{ 009161 if( iOffset+ovflPageSize<(u32)nTotal ){ 009162 ovflPgno = get4byte(pPage->aData); 009163 }else{ 009164 ovflPageSize = nTotal - iOffset; 009165 } 009166 rc = btreeOverwriteContent(pPage, pPage->aData+4, pX, 009167 iOffset, ovflPageSize); 009168 } 009169 sqlite3PagerUnref(pPage->pDbPage); 009170 if( rc ) return rc; 009171 iOffset += ovflPageSize; 009172 
}while( iOffset<nTotal ); 009173 return SQLITE_OK; 009174 } 009175 009176 /* 009177 ** Overwrite the cell that cursor pCur is pointing to with fresh content 009178 ** contained in pX. 009179 */ 009180 static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){ 009181 int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */ 009182 MemPage *pPage = pCur->pPage; /* Page being written */ 009183 009184 if( pCur->info.pPayload + pCur->info.nLocal > pPage->aDataEnd 009185 || pCur->info.pPayload < pPage->aData + pPage->cellOffset 009186 ){ 009187 return SQLITE_CORRUPT_BKPT; 009188 } 009189 if( pCur->info.nLocal==nTotal ){ 009190 /* The entire cell is local */ 009191 return btreeOverwriteContent(pPage, pCur->info.pPayload, pX, 009192 0, pCur->info.nLocal); 009193 }else{ 009194 /* The cell contains overflow content */ 009195 return btreeOverwriteOverflowCell(pCur, pX); 009196 } 009197 } 009198 009199 009200 /* 009201 ** Insert a new record into the BTree. The content of the new record 009202 ** is described by the pX object. The pCur cursor is used only to 009203 ** define what table the record should be inserted into, and is left 009204 ** pointing at a random location. 009205 ** 009206 ** For a table btree (used for rowid tables), only the pX.nKey value of 009207 ** the key is used. The pX.pKey value must be NULL. The pX.nKey is the 009208 ** rowid or INTEGER PRIMARY KEY of the row. The pX.nData,pData,nZero fields 009209 ** hold the content of the row. 009210 ** 009211 ** For an index btree (used for indexes and WITHOUT ROWID tables), the 009212 ** key is an arbitrary byte sequence stored in pX.pKey,nKey. The 009213 ** pX.pData,nData,nZero fields must be zero. 009214 ** 009215 ** If the seekResult parameter is non-zero, then a successful call to 009216 ** sqlite3BtreeIndexMoveto() to seek cursor pCur to (pKey,nKey) has already 009217 ** been performed. 
In other words, if seekResult!=0 then the cursor 009218 ** is currently pointing to a cell that will be adjacent to the cell 009219 ** to be inserted. If seekResult<0 then pCur points to a cell that is 009220 ** smaller then (pKey,nKey). If seekResult>0 then pCur points to a cell 009221 ** that is larger than (pKey,nKey). 009222 ** 009223 ** If seekResult==0, that means pCur is pointing at some unknown location. 009224 ** In that case, this routine must seek the cursor to the correct insertion 009225 ** point for (pKey,nKey) before doing the insertion. For index btrees, 009226 ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked 009227 ** key values and pX->aMem can be used instead of pX->pKey to avoid having 009228 ** to decode the key. 009229 */ 009230 int sqlite3BtreeInsert( 009231 BtCursor *pCur, /* Insert data into the table of this cursor */ 009232 const BtreePayload *pX, /* Content of the row to be inserted */ 009233 int flags, /* True if this is likely an append */ 009234 int seekResult /* Result of prior IndexMoveto() call */ 009235 ){ 009236 int rc; 009237 int loc = seekResult; /* -1: before desired location +1: after */ 009238 int szNew = 0; 009239 int idx; 009240 MemPage *pPage; 009241 Btree *p = pCur->pBtree; 009242 unsigned char *oldCell; 009243 unsigned char *newCell = 0; 009244 009245 assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND|BTREE_PREFORMAT))==flags ); 009246 assert( (flags & BTREE_PREFORMAT)==0 || seekResult || pCur->pKeyInfo==0 ); 009247 009248 /* Save the positions of any other cursors open on this table. 009249 ** 009250 ** In some cases, the call to btreeMoveto() below is a no-op. For 009251 ** example, when inserting data into a table with auto-generated integer 009252 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the 009253 ** integer key to use. It then calls this function to actually insert the 009254 ** data into the intkey B-Tree. 
In this case btreeMoveto() recognizes 009255 ** that the cursor is already where it needs to be and returns without 009256 ** doing any work. To avoid thwarting these optimizations, it is important 009257 ** not to clear the cursor here. 009258 */ 009259 if( pCur->curFlags & BTCF_Multiple ){ 009260 rc = saveAllCursors(p->pBt, pCur->pgnoRoot, pCur); 009261 if( rc ) return rc; 009262 if( loc && pCur->iPage<0 ){ 009263 /* This can only happen if the schema is corrupt such that there is more 009264 ** than one table or index with the same root page as used by the cursor. 009265 ** Which can only happen if the SQLITE_NoSchemaError flag was set when 009266 ** the schema was loaded. This cannot be asserted though, as a user might 009267 ** set the flag, load the schema, and then unset the flag. */ 009268 return SQLITE_CORRUPT_BKPT; 009269 } 009270 } 009271 009272 /* Ensure that the cursor is not in the CURSOR_FAULT state and that it 009273 ** points to a valid cell. 009274 */ 009275 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 009276 testcase( pCur->eState==CURSOR_REQUIRESEEK ); 009277 testcase( pCur->eState==CURSOR_FAULT ); 009278 rc = moveToRoot(pCur); 009279 if( rc && rc!=SQLITE_EMPTY ) return rc; 009280 } 009281 009282 assert( cursorOwnsBtShared(pCur) ); 009283 assert( (pCur->curFlags & BTCF_WriteFlag)!=0 009284 && p->pBt->inTransaction==TRANS_WRITE 009285 && (p->pBt->btsFlags & BTS_READ_ONLY)==0 ); 009286 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 009287 009288 /* Assert that the caller has been consistent. If this cursor was opened 009289 ** expecting an index b-tree, then the caller should be inserting blob 009290 ** keys with no associated data. If the cursor was opened expecting an 009291 ** intkey table, the caller should be inserting integer keys with a 009292 ** blob of associated data. 
*/ 009293 assert( (flags & BTREE_PREFORMAT) || (pX->pKey==0)==(pCur->pKeyInfo==0) ); 009294 009295 if( pCur->pKeyInfo==0 ){ 009296 assert( pX->pKey==0 ); 009297 /* If this is an insert into a table b-tree, invalidate any incrblob 009298 ** cursors open on the row being replaced */ 009299 if( p->hasIncrblobCur ){ 009300 invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0); 009301 } 009302 009303 /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing 009304 ** to a row with the same key as the new entry being inserted. 009305 */ 009306 #ifdef SQLITE_DEBUG 009307 if( flags & BTREE_SAVEPOSITION ){ 009308 assert( pCur->curFlags & BTCF_ValidNKey ); 009309 assert( pX->nKey==pCur->info.nKey ); 009310 assert( loc==0 ); 009311 } 009312 #endif 009313 009314 /* On the other hand, BTREE_SAVEPOSITION==0 does not imply 009315 ** that the cursor is not pointing to a row to be overwritten. 009316 ** So do a complete check. 009317 */ 009318 if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){ 009319 /* The cursor is pointing to the entry that is to be 009320 ** overwritten */ 009321 assert( pX->nData>=0 && pX->nZero>=0 ); 009322 if( pCur->info.nSize!=0 009323 && pCur->info.nPayload==(u32)pX->nData+pX->nZero 009324 ){ 009325 /* New entry is the same size as the old. Do an overwrite */ 009326 return btreeOverwriteCell(pCur, pX); 009327 } 009328 assert( loc==0 ); 009329 }else if( loc==0 ){ 009330 /* The cursor is *not* pointing to the cell to be overwritten, nor 009331 ** to an adjacent cell. Move the cursor so that it is pointing either 009332 ** to the cell to be overwritten or an adjacent cell. 
009333 */ 009334 rc = sqlite3BtreeTableMoveto(pCur, pX->nKey, 009335 (flags & BTREE_APPEND)!=0, &loc); 009336 if( rc ) return rc; 009337 } 009338 }else{ 009339 /* This is an index or a WITHOUT ROWID table */ 009340 009341 /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing 009342 ** to a row with the same key as the new entry being inserted. 009343 */ 009344 assert( (flags & BTREE_SAVEPOSITION)==0 || loc==0 ); 009345 009346 /* If the cursor is not already pointing either to the cell to be 009347 ** overwritten, or if a new cell is being inserted, if the cursor is 009348 ** not pointing to an immediately adjacent cell, then move the cursor 009349 ** so that it does. 009350 */ 009351 if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){ 009352 if( pX->nMem ){ 009353 UnpackedRecord r; 009354 r.pKeyInfo = pCur->pKeyInfo; 009355 r.aMem = pX->aMem; 009356 r.nField = pX->nMem; 009357 r.default_rc = 0; 009358 r.eqSeen = 0; 009359 rc = sqlite3BtreeIndexMoveto(pCur, &r, &loc); 009360 }else{ 009361 rc = btreeMoveto(pCur, pX->pKey, pX->nKey, 009362 (flags & BTREE_APPEND)!=0, &loc); 009363 } 009364 if( rc ) return rc; 009365 } 009366 009367 /* If the cursor is currently pointing to an entry to be overwritten 009368 ** and the new content is the same as as the old, then use the 009369 ** overwrite optimization. 
009370 */ 009371 if( loc==0 ){ 009372 getCellInfo(pCur); 009373 if( pCur->info.nKey==pX->nKey ){ 009374 BtreePayload x2; 009375 x2.pData = pX->pKey; 009376 x2.nData = pX->nKey; 009377 x2.nZero = 0; 009378 return btreeOverwriteCell(pCur, &x2); 009379 } 009380 } 009381 } 009382 assert( pCur->eState==CURSOR_VALID 009383 || (pCur->eState==CURSOR_INVALID && loc) || CORRUPT_DB ); 009384 009385 pPage = pCur->pPage; 009386 assert( pPage->intKey || pX->nKey>=0 || (flags & BTREE_PREFORMAT) ); 009387 assert( pPage->leaf || !pPage->intKey ); 009388 if( pPage->nFree<0 ){ 009389 if( NEVER(pCur->eState>CURSOR_INVALID) ){ 009390 /* ^^^^^--- due to the moveToRoot() call above */ 009391 rc = SQLITE_CORRUPT_BKPT; 009392 }else{ 009393 rc = btreeComputeFreeSpace(pPage); 009394 } 009395 if( rc ) return rc; 009396 } 009397 009398 TRACE(("INSERT: table=%u nkey=%lld ndata=%u page=%u %s\n", 009399 pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno, 009400 loc==0 ? "overwrite" : "new entry")); 009401 assert( pPage->isInit || CORRUPT_DB ); 009402 newCell = p->pBt->pTmpSpace; 009403 assert( newCell!=0 ); 009404 assert( BTREE_PREFORMAT==OPFLAG_PREFORMAT ); 009405 if( flags & BTREE_PREFORMAT ){ 009406 rc = SQLITE_OK; 009407 szNew = p->pBt->nPreformatSize; 009408 if( szNew<4 ){ 009409 szNew = 4; 009410 newCell[3] = 0; 009411 } 009412 if( ISAUTOVACUUM(p->pBt) && szNew>pPage->maxLocal ){ 009413 CellInfo info; 009414 pPage->xParseCell(pPage, newCell, &info); 009415 if( info.nPayload!=info.nLocal ){ 009416 Pgno ovfl = get4byte(&newCell[szNew-4]); 009417 ptrmapPut(p->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, &rc); 009418 if( NEVER(rc) ) goto end_insert; 009419 } 009420 } 009421 }else{ 009422 rc = fillInCell(pPage, newCell, pX, &szNew); 009423 if( rc ) goto end_insert; 009424 } 009425 assert( szNew==pPage->xCellSize(pPage, newCell) ); 009426 assert( szNew <= MX_CELL_SIZE(p->pBt) ); 009427 idx = pCur->ix; 009428 pCur->info.nSize = 0; 009429 if( loc==0 ){ 009430 CellInfo info; 009431 assert( idx>=0 ); 
009432 if( idx>=pPage->nCell ){ 009433 return SQLITE_CORRUPT_BKPT; 009434 } 009435 rc = sqlite3PagerWrite(pPage->pDbPage); 009436 if( rc ){ 009437 goto end_insert; 009438 } 009439 oldCell = findCell(pPage, idx); 009440 if( !pPage->leaf ){ 009441 memcpy(newCell, oldCell, 4); 009442 } 009443 BTREE_CLEAR_CELL(rc, pPage, oldCell, info); 009444 testcase( pCur->curFlags & BTCF_ValidOvfl ); 009445 invalidateOverflowCache(pCur); 009446 if( info.nSize==szNew && info.nLocal==info.nPayload 009447 && (!ISAUTOVACUUM(p->pBt) || szNew<pPage->minLocal) 009448 ){ 009449 /* Overwrite the old cell with the new if they are the same size. 009450 ** We could also try to do this if the old cell is smaller, then add 009451 ** the leftover space to the free list. But experiments show that 009452 ** doing that is no faster then skipping this optimization and just 009453 ** calling dropCell() and insertCell(). 009454 ** 009455 ** This optimization cannot be used on an autovacuum database if the 009456 ** new entry uses overflow pages, as the insertCell() call below is 009457 ** necessary to add the PTRMAP_OVERFLOW1 pointer-map entry. 
*/ 009458 assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */ 009459 if( oldCell < pPage->aData+pPage->hdrOffset+10 ){ 009460 return SQLITE_CORRUPT_BKPT; 009461 } 009462 if( oldCell+szNew > pPage->aDataEnd ){ 009463 return SQLITE_CORRUPT_BKPT; 009464 } 009465 memcpy(oldCell, newCell, szNew); 009466 return SQLITE_OK; 009467 } 009468 dropCell(pPage, idx, info.nSize, &rc); 009469 if( rc ) goto end_insert; 009470 }else if( loc<0 && pPage->nCell>0 ){ 009471 assert( pPage->leaf ); 009472 idx = ++pCur->ix; 009473 pCur->curFlags &= ~BTCF_ValidNKey; 009474 }else{ 009475 assert( pPage->leaf ); 009476 } 009477 rc = insertCellFast(pPage, idx, newCell, szNew); 009478 assert( pPage->nOverflow==0 || rc==SQLITE_OK ); 009479 assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 ); 009480 009481 /* If no error has occurred and pPage has an overflow cell, call balance() 009482 ** to redistribute the cells within the tree. Since balance() may move 009483 ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey 009484 ** variables. 009485 ** 009486 ** Previous versions of SQLite called moveToRoot() to move the cursor 009487 ** back to the root page as balance() used to invalidate the contents 009488 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that, 009489 ** set the cursor state to "invalid". This makes common insert operations 009490 ** slightly faster. 009491 ** 009492 ** There is a subtle but important optimization here too. When inserting 009493 ** multiple records into an intkey b-tree using a single cursor (as can 009494 ** happen while processing an "INSERT INTO ... SELECT" statement), it 009495 ** is advantageous to leave the cursor pointing to the last entry in 009496 ** the b-tree if possible. 
If the cursor is left pointing to the last 009497 ** entry in the table, and the next row inserted has an integer key 009498 ** larger than the largest existing key, it is possible to insert the 009499 ** row without seeking the cursor. This can be a big performance boost. 009500 */ 009501 if( pPage->nOverflow ){ 009502 assert( rc==SQLITE_OK ); 009503 pCur->curFlags &= ~(BTCF_ValidNKey); 009504 rc = balance(pCur); 009505 009506 /* Must make sure nOverflow is reset to zero even if the balance() 009507 ** fails. Internal data structure corruption will result otherwise. 009508 ** Also, set the cursor state to invalid. This stops saveCursorPosition() 009509 ** from trying to save the current position of the cursor. */ 009510 pCur->pPage->nOverflow = 0; 009511 pCur->eState = CURSOR_INVALID; 009512 if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){ 009513 btreeReleaseAllCursorPages(pCur); 009514 if( pCur->pKeyInfo ){ 009515 assert( pCur->pKey==0 ); 009516 pCur->pKey = sqlite3Malloc( pX->nKey ); 009517 if( pCur->pKey==0 ){ 009518 rc = SQLITE_NOMEM; 009519 }else{ 009520 memcpy(pCur->pKey, pX->pKey, pX->nKey); 009521 } 009522 } 009523 pCur->eState = CURSOR_REQUIRESEEK; 009524 pCur->nKey = pX->nKey; 009525 } 009526 } 009527 assert( pCur->iPage<0 || pCur->pPage->nOverflow==0 ); 009528 009529 end_insert: 009530 return rc; 009531 } 009532 009533 /* 009534 ** This function is used as part of copying the current row from cursor 009535 ** pSrc into cursor pDest. If the cursors are open on intkey tables, then 009536 ** parameter iKey is used as the rowid value when the record is copied 009537 ** into pDest. Otherwise, the record is copied verbatim. 009538 ** 009539 ** This function does not actually write the new value to cursor pDest. 009540 ** Instead, it creates and populates any required overflow pages and 009541 ** writes the data for the new cell into the BtShared.pTmpSpace buffer 009542 ** for the destination database. 
The size of the cell, in bytes, is left 009543 ** in BtShared.nPreformatSize. The caller completes the insertion by 009544 ** calling sqlite3BtreeInsert() with the BTREE_PREFORMAT flag specified. 009545 ** 009546 ** SQLITE_OK is returned if successful, or an SQLite error code otherwise. 009547 */ 009548 int sqlite3BtreeTransferRow(BtCursor *pDest, BtCursor *pSrc, i64 iKey){ 009549 BtShared *pBt = pDest->pBt; 009550 u8 *aOut = pBt->pTmpSpace; /* Pointer to next output buffer */ 009551 const u8 *aIn; /* Pointer to next input buffer */ 009552 u32 nIn; /* Size of input buffer aIn[] */ 009553 u32 nRem; /* Bytes of data still to copy */ 009554 009555 getCellInfo(pSrc); 009556 if( pSrc->info.nPayload<0x80 ){ 009557 *(aOut++) = pSrc->info.nPayload; 009558 }else{ 009559 aOut += sqlite3PutVarint(aOut, pSrc->info.nPayload); 009560 } 009561 if( pDest->pKeyInfo==0 ) aOut += putVarint(aOut, iKey); 009562 nIn = pSrc->info.nLocal; 009563 aIn = pSrc->info.pPayload; 009564 if( aIn+nIn>pSrc->pPage->aDataEnd ){ 009565 return SQLITE_CORRUPT_BKPT; 009566 } 009567 nRem = pSrc->info.nPayload; 009568 if( nIn==nRem && nIn<pDest->pPage->maxLocal ){ 009569 memcpy(aOut, aIn, nIn); 009570 pBt->nPreformatSize = nIn + (aOut - pBt->pTmpSpace); 009571 return SQLITE_OK; 009572 }else{ 009573 int rc = SQLITE_OK; 009574 Pager *pSrcPager = pSrc->pBt->pPager; 009575 u8 *pPgnoOut = 0; 009576 Pgno ovflIn = 0; 009577 DbPage *pPageIn = 0; 009578 MemPage *pPageOut = 0; 009579 u32 nOut; /* Size of output buffer aOut[] */ 009580 009581 nOut = btreePayloadToLocal(pDest->pPage, pSrc->info.nPayload); 009582 pBt->nPreformatSize = nOut + (aOut - pBt->pTmpSpace); 009583 if( nOut<pSrc->info.nPayload ){ 009584 pPgnoOut = &aOut[nOut]; 009585 pBt->nPreformatSize += 4; 009586 } 009587 009588 if( nRem>nIn ){ 009589 if( aIn+nIn+4>pSrc->pPage->aDataEnd ){ 009590 return SQLITE_CORRUPT_BKPT; 009591 } 009592 ovflIn = get4byte(&pSrc->info.pPayload[nIn]); 009593 } 009594 009595 do { 009596 nRem -= nOut; 009597 do{ 009598 assert( 
nOut>0 ); 009599 if( nIn>0 ){ 009600 int nCopy = MIN(nOut, nIn); 009601 memcpy(aOut, aIn, nCopy); 009602 nOut -= nCopy; 009603 nIn -= nCopy; 009604 aOut += nCopy; 009605 aIn += nCopy; 009606 } 009607 if( nOut>0 ){ 009608 sqlite3PagerUnref(pPageIn); 009609 pPageIn = 0; 009610 rc = sqlite3PagerGet(pSrcPager, ovflIn, &pPageIn, PAGER_GET_READONLY); 009611 if( rc==SQLITE_OK ){ 009612 aIn = (const u8*)sqlite3PagerGetData(pPageIn); 009613 ovflIn = get4byte(aIn); 009614 aIn += 4; 009615 nIn = pSrc->pBt->usableSize - 4; 009616 } 009617 } 009618 }while( rc==SQLITE_OK && nOut>0 ); 009619 009620 if( rc==SQLITE_OK && nRem>0 && ALWAYS(pPgnoOut) ){ 009621 Pgno pgnoNew; 009622 MemPage *pNew = 0; 009623 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 009624 put4byte(pPgnoOut, pgnoNew); 009625 if( ISAUTOVACUUM(pBt) && pPageOut ){ 009626 ptrmapPut(pBt, pgnoNew, PTRMAP_OVERFLOW2, pPageOut->pgno, &rc); 009627 } 009628 releasePage(pPageOut); 009629 pPageOut = pNew; 009630 if( pPageOut ){ 009631 pPgnoOut = pPageOut->aData; 009632 put4byte(pPgnoOut, 0); 009633 aOut = &pPgnoOut[4]; 009634 nOut = MIN(pBt->usableSize - 4, nRem); 009635 } 009636 } 009637 }while( nRem>0 && rc==SQLITE_OK ); 009638 009639 releasePage(pPageOut); 009640 sqlite3PagerUnref(pPageIn); 009641 return rc; 009642 } 009643 } 009644 009645 /* 009646 ** Delete the entry that the cursor is pointing to. 009647 ** 009648 ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then 009649 ** the cursor is left pointing at an arbitrary location after the delete. 009650 ** But if that bit is set, then the cursor is left in a state such that 009651 ** the next call to BtreeNext() or BtreePrev() moves it to the same row 009652 ** as it would have been on if the call to BtreeDelete() had been omitted. 009653 ** 009654 ** The BTREE_AUXDELETE bit of flags indicates that is one of several deletes 009655 ** associated with a single table entry and its indexes. 
Only one of those 009656 ** deletes is considered the "primary" delete. The primary delete occurs 009657 ** on a cursor that is not a BTREE_FORDELETE cursor. All but one delete 009658 ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag. 009659 ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation, 009660 ** but which might be used by alternative storage engines. 009661 */ 009662 int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ 009663 Btree *p = pCur->pBtree; 009664 BtShared *pBt = p->pBt; 009665 int rc; /* Return code */ 009666 MemPage *pPage; /* Page to delete cell from */ 009667 unsigned char *pCell; /* Pointer to cell to delete */ 009668 int iCellIdx; /* Index of cell to delete */ 009669 int iCellDepth; /* Depth of node containing pCell */ 009670 CellInfo info; /* Size of the cell being deleted */ 009671 u8 bPreserve; /* Keep cursor valid. 2 for CURSOR_SKIPNEXT */ 009672 009673 assert( cursorOwnsBtShared(pCur) ); 009674 assert( pBt->inTransaction==TRANS_WRITE ); 009675 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 009676 assert( pCur->curFlags & BTCF_WriteFlag ); 009677 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 009678 assert( !hasReadConflicts(p, pCur->pgnoRoot) ); 009679 assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 ); 009680 if( pCur->eState!=CURSOR_VALID ){ 009681 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 009682 rc = btreeRestoreCursorPosition(pCur); 009683 assert( rc!=SQLITE_OK || CORRUPT_DB || pCur->eState==CURSOR_VALID ); 009684 if( rc || pCur->eState!=CURSOR_VALID ) return rc; 009685 }else{ 009686 return SQLITE_CORRUPT_BKPT; 009687 } 009688 } 009689 assert( pCur->eState==CURSOR_VALID ); 009690 009691 iCellDepth = pCur->iPage; 009692 iCellIdx = pCur->ix; 009693 pPage = pCur->pPage; 009694 if( pPage->nCell<=iCellIdx ){ 009695 return SQLITE_CORRUPT_BKPT; 009696 } 009697 pCell = findCell(pPage, iCellIdx); 009698 if( pPage->nFree<0 && btreeComputeFreeSpace(pPage) ){ 
009699 return SQLITE_CORRUPT_BKPT; 009700 } 009701 if( pCell<&pPage->aCellIdx[pPage->nCell] ){ 009702 return SQLITE_CORRUPT_BKPT; 009703 } 009704 009705 /* If the BTREE_SAVEPOSITION bit is on, then the cursor position must 009706 ** be preserved following this delete operation. If the current delete 009707 ** will cause a b-tree rebalance, then this is done by saving the cursor 009708 ** key and leaving the cursor in CURSOR_REQUIRESEEK state before 009709 ** returning. 009710 ** 009711 ** If the current delete will not cause a rebalance, then the cursor 009712 ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately 009713 ** before or after the deleted entry. 009714 ** 009715 ** The bPreserve value records which path is required: 009716 ** 009717 ** bPreserve==0 Not necessary to save the cursor position 009718 ** bPreserve==1 Use CURSOR_REQUIRESEEK to save the cursor position 009719 ** bPreserve==2 Cursor won't move. Set CURSOR_SKIPNEXT. 009720 */ 009721 bPreserve = (flags & BTREE_SAVEPOSITION)!=0; 009722 if( bPreserve ){ 009723 if( !pPage->leaf 009724 || (pPage->nFree+pPage->xCellSize(pPage,pCell)+2) > 009725 (int)(pBt->usableSize*2/3) 009726 || pPage->nCell==1 /* See dbfuzz001.test for a test case */ 009727 ){ 009728 /* A b-tree rebalance will be required after deleting this entry. 009729 ** Save the cursor key. */ 009730 rc = saveCursorKey(pCur); 009731 if( rc ) return rc; 009732 }else{ 009733 bPreserve = 2; 009734 } 009735 } 009736 009737 /* If the page containing the entry to delete is not a leaf page, move 009738 ** the cursor to the largest entry in the tree that is smaller than 009739 ** the entry being deleted. This cell will replace the cell being deleted 009740 ** from the internal node. The 'previous' entry is used for this instead 009741 ** of the 'next' entry, as the previous entry is always a part of the 009742 ** sub-tree headed by the child page of the cell being deleted. 
This makes 009743 ** balancing the tree following the delete operation easier. */ 009744 if( !pPage->leaf ){ 009745 rc = sqlite3BtreePrevious(pCur, 0); 009746 assert( rc!=SQLITE_DONE ); 009747 if( rc ) return rc; 009748 } 009749 009750 /* Save the positions of any other cursors open on this table before 009751 ** making any modifications. */ 009752 if( pCur->curFlags & BTCF_Multiple ){ 009753 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); 009754 if( rc ) return rc; 009755 } 009756 009757 /* If this is a delete operation to remove a row from a table b-tree, 009758 ** invalidate any incrblob cursors open on the row being deleted. */ 009759 if( pCur->pKeyInfo==0 && p->hasIncrblobCur ){ 009760 invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0); 009761 } 009762 009763 /* Make the page containing the entry to be deleted writable. Then free any 009764 ** overflow pages associated with the entry and finally remove the cell 009765 ** itself from within the page. */ 009766 rc = sqlite3PagerWrite(pPage->pDbPage); 009767 if( rc ) return rc; 009768 BTREE_CLEAR_CELL(rc, pPage, pCell, info); 009769 dropCell(pPage, iCellIdx, info.nSize, &rc); 009770 if( rc ) return rc; 009771 009772 /* If the cell deleted was not located on a leaf page, then the cursor 009773 ** is currently pointing to the largest entry in the sub-tree headed 009774 ** by the child-page of the cell that was just deleted from an internal 009775 ** node. The cell from the leaf node needs to be moved to the internal 009776 ** node to replace the deleted cell. 
*/ 009777 if( !pPage->leaf ){ 009778 MemPage *pLeaf = pCur->pPage; 009779 int nCell; 009780 Pgno n; 009781 unsigned char *pTmp; 009782 009783 if( pLeaf->nFree<0 ){ 009784 rc = btreeComputeFreeSpace(pLeaf); 009785 if( rc ) return rc; 009786 } 009787 if( iCellDepth<pCur->iPage-1 ){ 009788 n = pCur->apPage[iCellDepth+1]->pgno; 009789 }else{ 009790 n = pCur->pPage->pgno; 009791 } 009792 pCell = findCell(pLeaf, pLeaf->nCell-1); 009793 if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT; 009794 nCell = pLeaf->xCellSize(pLeaf, pCell); 009795 assert( MX_CELL_SIZE(pBt) >= nCell ); 009796 pTmp = pBt->pTmpSpace; 009797 assert( pTmp!=0 ); 009798 rc = sqlite3PagerWrite(pLeaf->pDbPage); 009799 if( rc==SQLITE_OK ){ 009800 rc = insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n); 009801 } 009802 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc); 009803 if( rc ) return rc; 009804 } 009805 009806 /* Balance the tree. If the entry deleted was located on a leaf page, 009807 ** then the cursor still points to that page. In this case the first 009808 ** call to balance() repairs the tree, and the if(...) condition is 009809 ** never true. 009810 ** 009811 ** Otherwise, if the entry deleted was on an internal node page, then 009812 ** pCur is pointing to the leaf page from which a cell was removed to 009813 ** replace the cell deleted from the internal node. This is slightly 009814 ** tricky as the leaf node may be underfull, and the internal node may 009815 ** be either under or overfull. In this case run the balancing algorithm 009816 ** on the leaf node first. If the balance proceeds far enough up the 009817 ** tree that we can be sure that any problem in the internal node has 009818 ** been corrected, so be it. Otherwise, after balancing the leaf node, 009819 ** walk the cursor up the tree to the internal node and balance it as 009820 ** well. 
*/ 009821 assert( pCur->pPage->nOverflow==0 ); 009822 assert( pCur->pPage->nFree>=0 ); 009823 if( pCur->pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){ 009824 /* Optimization: If the free space is less than 2/3rds of the page, 009825 ** then balance() will always be a no-op. No need to invoke it. */ 009826 rc = SQLITE_OK; 009827 }else{ 009828 rc = balance(pCur); 009829 } 009830 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){ 009831 releasePageNotNull(pCur->pPage); 009832 pCur->iPage--; 009833 while( pCur->iPage>iCellDepth ){ 009834 releasePage(pCur->apPage[pCur->iPage--]); 009835 } 009836 pCur->pPage = pCur->apPage[pCur->iPage]; 009837 rc = balance(pCur); 009838 } 009839 009840 if( rc==SQLITE_OK ){ 009841 if( bPreserve>1 ){ 009842 assert( (pCur->iPage==iCellDepth || CORRUPT_DB) ); 009843 assert( pPage==pCur->pPage || CORRUPT_DB ); 009844 assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell ); 009845 pCur->eState = CURSOR_SKIPNEXT; 009846 if( iCellIdx>=pPage->nCell ){ 009847 pCur->skipNext = -1; 009848 pCur->ix = pPage->nCell-1; 009849 }else{ 009850 pCur->skipNext = 1; 009851 } 009852 }else{ 009853 rc = moveToRoot(pCur); 009854 if( bPreserve ){ 009855 btreeReleaseAllCursorPages(pCur); 009856 pCur->eState = CURSOR_REQUIRESEEK; 009857 } 009858 if( rc==SQLITE_EMPTY ) rc = SQLITE_OK; 009859 } 009860 } 009861 return rc; 009862 } 009863 009864 /* 009865 ** Create a new BTree table. Write into *piTable the page 009866 ** number for the root page of the new table. 009867 ** 009868 ** The type of type is determined by the flags parameter. Only the 009869 ** following values of flags are currently in use. 
Other values for 009870 ** flags might not work: 009871 ** 009872 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys 009873 ** BTREE_ZERODATA Used for SQL indices 009874 */ 009875 static int btreeCreateTable(Btree *p, Pgno *piTable, int createTabFlags){ 009876 BtShared *pBt = p->pBt; 009877 MemPage *pRoot; 009878 Pgno pgnoRoot; 009879 int rc; 009880 int ptfFlags; /* Page-type flags for the root page of new table */ 009881 009882 assert( sqlite3BtreeHoldsMutex(p) ); 009883 assert( pBt->inTransaction==TRANS_WRITE ); 009884 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 009885 009886 #ifdef SQLITE_OMIT_AUTOVACUUM 009887 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 009888 if( rc ){ 009889 return rc; 009890 } 009891 #else 009892 if( pBt->autoVacuum ){ 009893 Pgno pgnoMove; /* Move a page here to make room for the root-page */ 009894 MemPage *pPageMove; /* The page to move to. */ 009895 009896 /* Creating a new table may probably require moving an existing database 009897 ** to make room for the new tables root page. In case this page turns 009898 ** out to be an overflow page, delete all overflow page-map caches 009899 ** held by open cursors. 009900 */ 009901 invalidateAllOverflowCache(pBt); 009902 009903 /* Read the value of meta[3] from the database to determine where the 009904 ** root page of the new table should go. meta[3] is the largest root-page 009905 ** created so far, so the new root-page is (meta[3]+1). 009906 */ 009907 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot); 009908 if( pgnoRoot>btreePagecount(pBt) ){ 009909 return SQLITE_CORRUPT_BKPT; 009910 } 009911 pgnoRoot++; 009912 009913 /* The new root-page may not be allocated on a pointer-map page, or the 009914 ** PENDING_BYTE page. 009915 */ 009916 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || 009917 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ 009918 pgnoRoot++; 009919 } 009920 assert( pgnoRoot>=3 ); 009921 009922 /* Allocate a page. 
The page that currently resides at pgnoRoot will 009923 ** be moved to the allocated page (unless the allocated page happens 009924 ** to reside at pgnoRoot). 009925 */ 009926 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT); 009927 if( rc!=SQLITE_OK ){ 009928 return rc; 009929 } 009930 009931 if( pgnoMove!=pgnoRoot ){ 009932 /* pgnoRoot is the page that will be used for the root-page of 009933 ** the new table (assuming an error did not occur). But we were 009934 ** allocated pgnoMove. If required (i.e. if it was not allocated 009935 ** by extending the file), the current page at position pgnoMove 009936 ** is already journaled. 009937 */ 009938 u8 eType = 0; 009939 Pgno iPtrPage = 0; 009940 009941 /* Save the positions of any open cursors. This is required in 009942 ** case they are holding a reference to an xFetch reference 009943 ** corresponding to page pgnoRoot. */ 009944 rc = saveAllCursors(pBt, 0, 0); 009945 releasePage(pPageMove); 009946 if( rc!=SQLITE_OK ){ 009947 return rc; 009948 } 009949 009950 /* Move the page currently at pgnoRoot to pgnoMove. 
*/ 009951 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 009952 if( rc!=SQLITE_OK ){ 009953 return rc; 009954 } 009955 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage); 009956 if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ 009957 rc = SQLITE_CORRUPT_BKPT; 009958 } 009959 if( rc!=SQLITE_OK ){ 009960 releasePage(pRoot); 009961 return rc; 009962 } 009963 assert( eType!=PTRMAP_ROOTPAGE ); 009964 assert( eType!=PTRMAP_FREEPAGE ); 009965 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0); 009966 releasePage(pRoot); 009967 009968 /* Obtain the page at pgnoRoot */ 009969 if( rc!=SQLITE_OK ){ 009970 return rc; 009971 } 009972 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 009973 if( rc!=SQLITE_OK ){ 009974 return rc; 009975 } 009976 rc = sqlite3PagerWrite(pRoot->pDbPage); 009977 if( rc!=SQLITE_OK ){ 009978 releasePage(pRoot); 009979 return rc; 009980 } 009981 }else{ 009982 pRoot = pPageMove; 009983 } 009984 009985 /* Update the pointer-map and meta-data with the new root-page number. */ 009986 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc); 009987 if( rc ){ 009988 releasePage(pRoot); 009989 return rc; 009990 } 009991 009992 /* When the new root page was allocated, page 1 was made writable in 009993 ** order either to increase the database filesize, or to decrement the 009994 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail. 
009995 */ 009996 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) ); 009997 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot); 009998 if( NEVER(rc) ){ 009999 releasePage(pRoot); 010000 return rc; 010001 } 010002 010003 }else{ 010004 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 010005 if( rc ) return rc; 010006 } 010007 #endif 010008 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 010009 if( createTabFlags & BTREE_INTKEY ){ 010010 ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF; 010011 }else{ 010012 ptfFlags = PTF_ZERODATA | PTF_LEAF; 010013 } 010014 zeroPage(pRoot, ptfFlags); 010015 sqlite3PagerUnref(pRoot->pDbPage); 010016 assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 ); 010017 *piTable = pgnoRoot; 010018 return SQLITE_OK; 010019 } 010020 int sqlite3BtreeCreateTable(Btree *p, Pgno *piTable, int flags){ 010021 int rc; 010022 sqlite3BtreeEnter(p); 010023 rc = btreeCreateTable(p, piTable, flags); 010024 sqlite3BtreeLeave(p); 010025 return rc; 010026 } 010027 010028 /* 010029 ** Erase the given database page and all its children. Return 010030 ** the page to the freelist. 
010031 */ 010032 static int clearDatabasePage( 010033 BtShared *pBt, /* The BTree that contains the table */ 010034 Pgno pgno, /* Page number to clear */ 010035 int freePageFlag, /* Deallocate page if true */ 010036 i64 *pnChange /* Add number of Cells freed to this counter */ 010037 ){ 010038 MemPage *pPage; 010039 int rc; 010040 unsigned char *pCell; 010041 int i; 010042 int hdr; 010043 CellInfo info; 010044 010045 assert( sqlite3_mutex_held(pBt->mutex) ); 010046 if( pgno>btreePagecount(pBt) ){ 010047 return SQLITE_CORRUPT_BKPT; 010048 } 010049 rc = getAndInitPage(pBt, pgno, &pPage, 0); 010050 if( rc ) return rc; 010051 if( (pBt->openFlags & BTREE_SINGLE)==0 010052 && sqlite3PagerPageRefcount(pPage->pDbPage) != (1 + (pgno==1)) 010053 ){ 010054 rc = SQLITE_CORRUPT_BKPT; 010055 goto cleardatabasepage_out; 010056 } 010057 hdr = pPage->hdrOffset; 010058 for(i=0; i<pPage->nCell; i++){ 010059 pCell = findCell(pPage, i); 010060 if( !pPage->leaf ){ 010061 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange); 010062 if( rc ) goto cleardatabasepage_out; 010063 } 010064 BTREE_CLEAR_CELL(rc, pPage, pCell, info); 010065 if( rc ) goto cleardatabasepage_out; 010066 } 010067 if( !pPage->leaf ){ 010068 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange); 010069 if( rc ) goto cleardatabasepage_out; 010070 if( pPage->intKey ) pnChange = 0; 010071 } 010072 if( pnChange ){ 010073 testcase( !pPage->intKey ); 010074 *pnChange += pPage->nCell; 010075 } 010076 if( freePageFlag ){ 010077 freePage(pPage, &rc); 010078 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){ 010079 zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF); 010080 } 010081 010082 cleardatabasepage_out: 010083 releasePage(pPage); 010084 return rc; 010085 } 010086 010087 /* 010088 ** Delete all information from a single table in the database. iTable is 010089 ** the page number of the root of the table. After this routine returns, 010090 ** the root page is empty, but still exists. 
010091 ** 010092 ** This routine will fail with SQLITE_LOCKED if there are any open 010093 ** read cursors on the table. Open write cursors are moved to the 010094 ** root of the table. 010095 ** 010096 ** If pnChange is not NULL, then the integer value pointed to by pnChange 010097 ** is incremented by the number of entries in the table. 010098 */ 010099 int sqlite3BtreeClearTable(Btree *p, int iTable, i64 *pnChange){ 010100 int rc; 010101 BtShared *pBt = p->pBt; 010102 sqlite3BtreeEnter(p); 010103 assert( p->inTrans==TRANS_WRITE ); 010104 010105 rc = saveAllCursors(pBt, (Pgno)iTable, 0); 010106 010107 if( SQLITE_OK==rc ){ 010108 /* Invalidate all incrblob cursors open on table iTable (assuming iTable 010109 ** is the root of a table b-tree - if it is not, the following call is 010110 ** a no-op). */ 010111 if( p->hasIncrblobCur ){ 010112 invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1); 010113 } 010114 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange); 010115 } 010116 sqlite3BtreeLeave(p); 010117 return rc; 010118 } 010119 010120 /* 010121 ** Delete all information from the single table that pCur is open on. 010122 ** 010123 ** This routine only work for pCur on an ephemeral table. 010124 */ 010125 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){ 010126 return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0); 010127 } 010128 010129 /* 010130 ** Erase all information in a table and add the root of the table to 010131 ** the freelist. Except, the root of the principle table (the one on 010132 ** page 1) is never added to the freelist. 010133 ** 010134 ** This routine will fail with SQLITE_LOCKED if there are any open 010135 ** cursors on the table. 
010136 ** 010137 ** If AUTOVACUUM is enabled and the page at iTable is not the last 010138 ** root page in the database file, then the last root page 010139 ** in the database file is moved into the slot formerly occupied by 010140 ** iTable and that last slot formerly occupied by the last root page 010141 ** is added to the freelist instead of iTable. In this say, all 010142 ** root pages are kept at the beginning of the database file, which 010143 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the 010144 ** page number that used to be the last root page in the file before 010145 ** the move. If no page gets moved, *piMoved is set to 0. 010146 ** The last root page is recorded in meta[3] and the value of 010147 ** meta[3] is updated by this procedure. 010148 */ 010149 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ 010150 int rc; 010151 MemPage *pPage = 0; 010152 BtShared *pBt = p->pBt; 010153 010154 assert( sqlite3BtreeHoldsMutex(p) ); 010155 assert( p->inTrans==TRANS_WRITE ); 010156 assert( iTable>=2 ); 010157 if( iTable>btreePagecount(pBt) ){ 010158 return SQLITE_CORRUPT_BKPT; 010159 } 010160 010161 rc = sqlite3BtreeClearTable(p, iTable, 0); 010162 if( rc ) return rc; 010163 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0); 010164 if( NEVER(rc) ){ 010165 releasePage(pPage); 010166 return rc; 010167 } 010168 010169 *piMoved = 0; 010170 010171 #ifdef SQLITE_OMIT_AUTOVACUUM 010172 freePage(pPage, &rc); 010173 releasePage(pPage); 010174 #else 010175 if( pBt->autoVacuum ){ 010176 Pgno maxRootPgno; 010177 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno); 010178 010179 if( iTable==maxRootPgno ){ 010180 /* If the table being dropped is the table with the largest root-page 010181 ** number in the database, put the root page on the free list. 
010182 */ 010183 freePage(pPage, &rc); 010184 releasePage(pPage); 010185 if( rc!=SQLITE_OK ){ 010186 return rc; 010187 } 010188 }else{ 010189 /* The table being dropped does not have the largest root-page 010190 ** number in the database. So move the page that does into the 010191 ** gap left by the deleted root-page. 010192 */ 010193 MemPage *pMove; 010194 releasePage(pPage); 010195 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 010196 if( rc!=SQLITE_OK ){ 010197 return rc; 010198 } 010199 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0); 010200 releasePage(pMove); 010201 if( rc!=SQLITE_OK ){ 010202 return rc; 010203 } 010204 pMove = 0; 010205 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 010206 freePage(pMove, &rc); 010207 releasePage(pMove); 010208 if( rc!=SQLITE_OK ){ 010209 return rc; 010210 } 010211 *piMoved = maxRootPgno; 010212 } 010213 010214 /* Set the new 'max-root-page' value in the database header. This 010215 ** is the old value less one, less one more if that happens to 010216 ** be a root-page number, less one again if that is the 010217 ** PENDING_BYTE_PAGE. 010218 */ 010219 maxRootPgno--; 010220 while( maxRootPgno==PENDING_BYTE_PAGE(pBt) 010221 || PTRMAP_ISPAGE(pBt, maxRootPgno) ){ 010222 maxRootPgno--; 010223 } 010224 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) ); 010225 010226 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno); 010227 }else{ 010228 freePage(pPage, &rc); 010229 releasePage(pPage); 010230 } 010231 #endif 010232 return rc; 010233 } 010234 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ 010235 int rc; 010236 sqlite3BtreeEnter(p); 010237 rc = btreeDropTable(p, iTable, piMoved); 010238 sqlite3BtreeLeave(p); 010239 return rc; 010240 } 010241 010242 010243 /* 010244 ** This function may only be called if the b-tree connection already 010245 ** has a read or write transaction open on the database. 010246 ** 010247 ** Read the meta-information out of a database file. 
Meta[0] 010248 ** is the number of free pages currently in the database. Meta[1] 010249 ** through meta[15] are available for use by higher layers. Meta[0] 010250 ** is read-only, the others are read/write. 010251 ** 010252 ** The schema layer numbers meta values differently. At the schema 010253 ** layer (and the SetCookie and ReadCookie opcodes) the number of 010254 ** free pages is not visible. So Cookie[0] is the same as Meta[1]. 010255 ** 010256 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case. Instead 010257 ** of reading the value out of the header, it instead loads the "DataVersion" 010258 ** from the pager. The BTREE_DATA_VERSION value is not actually stored in the 010259 ** database file. It is a number computed by the pager. But its access 010260 ** pattern is the same as header meta values, and so it is convenient to 010261 ** read it from this routine. 010262 */ 010263 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ 010264 BtShared *pBt = p->pBt; 010265 010266 sqlite3BtreeEnter(p); 010267 assert( p->inTrans>TRANS_NONE ); 010268 assert( SQLITE_OK==querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK) ); 010269 assert( pBt->pPage1 ); 010270 assert( idx>=0 && idx<=15 ); 010271 010272 if( idx==BTREE_DATA_VERSION ){ 010273 *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iBDataVersion; 010274 }else{ 010275 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); 010276 } 010277 010278 /* If auto-vacuum is disabled in this build and this is an auto-vacuum 010279 ** database, mark the database as read-only. */ 010280 #ifdef SQLITE_OMIT_AUTOVACUUM 010281 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){ 010282 pBt->btsFlags |= BTS_READ_ONLY; 010283 } 010284 #endif 010285 010286 sqlite3BtreeLeave(p); 010287 } 010288 010289 /* 010290 ** Write meta-information back into the database. Meta[0] is 010291 ** read-only and may not be written. 
010292 */ 010293 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ 010294 BtShared *pBt = p->pBt; 010295 unsigned char *pP1; 010296 int rc; 010297 assert( idx>=1 && idx<=15 ); 010298 sqlite3BtreeEnter(p); 010299 assert( p->inTrans==TRANS_WRITE ); 010300 assert( pBt->pPage1!=0 ); 010301 pP1 = pBt->pPage1->aData; 010302 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 010303 if( rc==SQLITE_OK ){ 010304 put4byte(&pP1[36 + idx*4], iMeta); 010305 #ifndef SQLITE_OMIT_AUTOVACUUM 010306 if( idx==BTREE_INCR_VACUUM ){ 010307 assert( pBt->autoVacuum || iMeta==0 ); 010308 assert( iMeta==0 || iMeta==1 ); 010309 pBt->incrVacuum = (u8)iMeta; 010310 } 010311 #endif 010312 } 010313 sqlite3BtreeLeave(p); 010314 return rc; 010315 } 010316 010317 /* 010318 ** The first argument, pCur, is a cursor opened on some b-tree. Count the 010319 ** number of entries in the b-tree and write the result to *pnEntry. 010320 ** 010321 ** SQLITE_OK is returned if the operation is successfully executed. 010322 ** Otherwise, if an error is encountered (i.e. an IO error or database 010323 ** corruption) an SQLite error code is returned. 010324 */ 010325 int sqlite3BtreeCount(sqlite3 *db, BtCursor *pCur, i64 *pnEntry){ 010326 i64 nEntry = 0; /* Value to return in *pnEntry */ 010327 int rc; /* Return code */ 010328 010329 rc = moveToRoot(pCur); 010330 if( rc==SQLITE_EMPTY ){ 010331 *pnEntry = 0; 010332 return SQLITE_OK; 010333 } 010334 010335 /* Unless an error occurs, the following loop runs one iteration for each 010336 ** page in the B-Tree structure (not including overflow pages). 010337 */ 010338 while( rc==SQLITE_OK && !AtomicLoad(&db->u1.isInterrupted) ){ 010339 int iIdx; /* Index of child node in parent */ 010340 MemPage *pPage; /* Current page of the b-tree */ 010341 010342 /* If this is a leaf page or the tree is not an int-key tree, then 010343 ** this page contains countable entries. Increment the entry counter 010344 ** accordingly. 
010345 */ 010346 pPage = pCur->pPage; 010347 if( pPage->leaf || !pPage->intKey ){ 010348 nEntry += pPage->nCell; 010349 } 010350 010351 /* pPage is a leaf node. This loop navigates the cursor so that it 010352 ** points to the first interior cell that it points to the parent of 010353 ** the next page in the tree that has not yet been visited. The 010354 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell 010355 ** of the page, or to the number of cells in the page if the next page 010356 ** to visit is the right-child of its parent. 010357 ** 010358 ** If all pages in the tree have been visited, return SQLITE_OK to the 010359 ** caller. 010360 */ 010361 if( pPage->leaf ){ 010362 do { 010363 if( pCur->iPage==0 ){ 010364 /* All pages of the b-tree have been visited. Return successfully. */ 010365 *pnEntry = nEntry; 010366 return moveToRoot(pCur); 010367 } 010368 moveToParent(pCur); 010369 }while ( pCur->ix>=pCur->pPage->nCell ); 010370 010371 pCur->ix++; 010372 pPage = pCur->pPage; 010373 } 010374 010375 /* Descend to the child node of the cell that the cursor currently 010376 ** points at. This is the right-child if (iIdx==pPage->nCell). 010377 */ 010378 iIdx = pCur->ix; 010379 if( iIdx==pPage->nCell ){ 010380 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 010381 }else{ 010382 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx))); 010383 } 010384 } 010385 010386 /* An error has occurred. Return an error code. */ 010387 return rc; 010388 } 010389 010390 /* 010391 ** Return the pager associated with a BTree. This routine is used for 010392 ** testing and debugging only. 
010393 */ 010394 Pager *sqlite3BtreePager(Btree *p){ 010395 return p->pBt->pPager; 010396 } 010397 010398 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 010399 /* 010400 ** Record an OOM error during integrity_check 010401 */ 010402 static void checkOom(IntegrityCk *pCheck){ 010403 pCheck->rc = SQLITE_NOMEM; 010404 pCheck->mxErr = 0; /* Causes integrity_check processing to stop */ 010405 if( pCheck->nErr==0 ) pCheck->nErr++; 010406 } 010407 010408 /* 010409 ** Invoke the progress handler, if appropriate. Also check for an 010410 ** interrupt. 010411 */ 010412 static void checkProgress(IntegrityCk *pCheck){ 010413 sqlite3 *db = pCheck->db; 010414 if( AtomicLoad(&db->u1.isInterrupted) ){ 010415 pCheck->rc = SQLITE_INTERRUPT; 010416 pCheck->nErr++; 010417 pCheck->mxErr = 0; 010418 } 010419 #ifndef SQLITE_OMIT_PROGRESS_CALLBACK 010420 if( db->xProgress ){ 010421 assert( db->nProgressOps>0 ); 010422 pCheck->nStep++; 010423 if( (pCheck->nStep % db->nProgressOps)==0 010424 && db->xProgress(db->pProgressArg) 010425 ){ 010426 pCheck->rc = SQLITE_INTERRUPT; 010427 pCheck->nErr++; 010428 pCheck->mxErr = 0; 010429 } 010430 } 010431 #endif 010432 } 010433 010434 /* 010435 ** Append a message to the error message string. 010436 */ 010437 static void checkAppendMsg( 010438 IntegrityCk *pCheck, 010439 const char *zFormat, 010440 ... 
010441 ){ 010442 va_list ap; 010443 checkProgress(pCheck); 010444 if( !pCheck->mxErr ) return; 010445 pCheck->mxErr--; 010446 pCheck->nErr++; 010447 va_start(ap, zFormat); 010448 if( pCheck->errMsg.nChar ){ 010449 sqlite3_str_append(&pCheck->errMsg, "\n", 1); 010450 } 010451 if( pCheck->zPfx ){ 010452 sqlite3_str_appendf(&pCheck->errMsg, pCheck->zPfx, 010453 pCheck->v0, pCheck->v1, pCheck->v2); 010454 } 010455 sqlite3_str_vappendf(&pCheck->errMsg, zFormat, ap); 010456 va_end(ap); 010457 if( pCheck->errMsg.accError==SQLITE_NOMEM ){ 010458 checkOom(pCheck); 010459 } 010460 } 010461 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 010462 010463 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 010464 010465 /* 010466 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that 010467 ** corresponds to page iPg is already set. 010468 */ 010469 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){ 010470 assert( pCheck->aPgRef!=0 ); 010471 assert( iPg<=pCheck->nCkPage && sizeof(pCheck->aPgRef[0])==1 ); 010472 return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07))); 010473 } 010474 010475 /* 010476 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg. 010477 */ 010478 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){ 010479 assert( pCheck->aPgRef!=0 ); 010480 assert( iPg<=pCheck->nCkPage && sizeof(pCheck->aPgRef[0])==1 ); 010481 pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07)); 010482 } 010483 010484 010485 /* 010486 ** Add 1 to the reference count for page iPage. If this is the second 010487 ** reference to the page, add an error message to pCheck->zErrMsg. 010488 ** Return 1 if there are 2 or more references to the page and 0 if 010489 ** if this is the first reference to the page. 010490 ** 010491 ** Also check that the page number is in bounds. 
010492 */ 010493 static int checkRef(IntegrityCk *pCheck, Pgno iPage){ 010494 if( iPage>pCheck->nCkPage || iPage==0 ){ 010495 checkAppendMsg(pCheck, "invalid page number %u", iPage); 010496 return 1; 010497 } 010498 if( getPageReferenced(pCheck, iPage) ){ 010499 checkAppendMsg(pCheck, "2nd reference to page %u", iPage); 010500 return 1; 010501 } 010502 setPageReferenced(pCheck, iPage); 010503 return 0; 010504 } 010505 010506 #ifndef SQLITE_OMIT_AUTOVACUUM 010507 /* 010508 ** Check that the entry in the pointer-map for page iChild maps to 010509 ** page iParent, pointer type ptrType. If not, append an error message 010510 ** to pCheck. 010511 */ 010512 static void checkPtrmap( 010513 IntegrityCk *pCheck, /* Integrity check context */ 010514 Pgno iChild, /* Child page number */ 010515 u8 eType, /* Expected pointer map type */ 010516 Pgno iParent /* Expected pointer map parent page number */ 010517 ){ 010518 int rc; 010519 u8 ePtrmapType; 010520 Pgno iPtrmapParent; 010521 010522 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); 010523 if( rc!=SQLITE_OK ){ 010524 if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) checkOom(pCheck); 010525 checkAppendMsg(pCheck, "Failed to read ptrmap key=%u", iChild); 010526 return; 010527 } 010528 010529 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ 010530 checkAppendMsg(pCheck, 010531 "Bad ptr map entry key=%u expected=(%u,%u) got=(%u,%u)", 010532 iChild, eType, iParent, ePtrmapType, iPtrmapParent); 010533 } 010534 } 010535 #endif 010536 010537 /* 010538 ** Check the integrity of the freelist or of an overflow page list. 010539 ** Verify that the number of pages on the list is N. 010540 */ 010541 static void checkList( 010542 IntegrityCk *pCheck, /* Integrity checking context */ 010543 int isFreeList, /* True for a freelist. 
False for overflow page list */ 010544 Pgno iPage, /* Page number for first page in the list */ 010545 u32 N /* Expected number of pages in the list */ 010546 ){ 010547 int i; 010548 u32 expected = N; 010549 int nErrAtStart = pCheck->nErr; 010550 while( iPage!=0 && pCheck->mxErr ){ 010551 DbPage *pOvflPage; 010552 unsigned char *pOvflData; 010553 if( checkRef(pCheck, iPage) ) break; 010554 N--; 010555 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){ 010556 checkAppendMsg(pCheck, "failed to get page %u", iPage); 010557 break; 010558 } 010559 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); 010560 if( isFreeList ){ 010561 u32 n = (u32)get4byte(&pOvflData[4]); 010562 #ifndef SQLITE_OMIT_AUTOVACUUM 010563 if( pCheck->pBt->autoVacuum ){ 010564 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0); 010565 } 010566 #endif 010567 if( n>pCheck->pBt->usableSize/4-2 ){ 010568 checkAppendMsg(pCheck, 010569 "freelist leaf count too big on page %u", iPage); 010570 N--; 010571 }else{ 010572 for(i=0; i<(int)n; i++){ 010573 Pgno iFreePage = get4byte(&pOvflData[8+i*4]); 010574 #ifndef SQLITE_OMIT_AUTOVACUUM 010575 if( pCheck->pBt->autoVacuum ){ 010576 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0); 010577 } 010578 #endif 010579 checkRef(pCheck, iFreePage); 010580 } 010581 N -= n; 010582 } 010583 } 010584 #ifndef SQLITE_OMIT_AUTOVACUUM 010585 else{ 010586 /* If this database supports auto-vacuum and iPage is not the last 010587 ** page in this overflow list, check that the pointer-map entry for 010588 ** the following page matches iPage. 010589 */ 010590 if( pCheck->pBt->autoVacuum && N>0 ){ 010591 i = get4byte(pOvflData); 010592 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage); 010593 } 010594 } 010595 #endif 010596 iPage = get4byte(pOvflData); 010597 sqlite3PagerUnref(pOvflPage); 010598 } 010599 if( N && nErrAtStart==pCheck->nErr ){ 010600 checkAppendMsg(pCheck, 010601 "%s is %u but should be %u", 010602 isFreeList ? 
"size" : "overflow list length", 010603 expected-N, expected); 010604 } 010605 } 010606 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 010607 010608 /* 010609 ** An implementation of a min-heap. 010610 ** 010611 ** aHeap[0] is the number of elements on the heap. aHeap[1] is the 010612 ** root element. The daughter nodes of aHeap[N] are aHeap[N*2] 010613 ** and aHeap[N*2+1]. 010614 ** 010615 ** The heap property is this: Every node is less than or equal to both 010616 ** of its daughter nodes. A consequence of the heap property is that the 010617 ** root node aHeap[1] is always the minimum value currently in the heap. 010618 ** 010619 ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto 010620 ** the heap, preserving the heap property. The btreeHeapPull() routine 010621 ** removes the root element from the heap (the minimum value in the heap) 010622 ** and then moves other nodes around as necessary to preserve the heap 010623 ** property. 010624 ** 010625 ** This heap is used for cell overlap and coverage testing. Each u32 010626 ** entry represents the span of a cell or freeblock on a btree page. 010627 ** The upper 16 bits are the index of the first byte of a range and the 010628 ** lower 16 bits are the index of the last byte of that range. 
010629 */ 010630 static void btreeHeapInsert(u32 *aHeap, u32 x){ 010631 u32 j, i; 010632 assert( aHeap!=0 ); 010633 i = ++aHeap[0]; 010634 aHeap[i] = x; 010635 while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){ 010636 x = aHeap[j]; 010637 aHeap[j] = aHeap[i]; 010638 aHeap[i] = x; 010639 i = j; 010640 } 010641 } 010642 static int btreeHeapPull(u32 *aHeap, u32 *pOut){ 010643 u32 j, i, x; 010644 if( (x = aHeap[0])==0 ) return 0; 010645 *pOut = aHeap[1]; 010646 aHeap[1] = aHeap[x]; 010647 aHeap[x] = 0xffffffff; 010648 aHeap[0]--; 010649 i = 1; 010650 while( (j = i*2)<=aHeap[0] ){ 010651 if( aHeap[j]>aHeap[j+1] ) j++; 010652 if( aHeap[i]<aHeap[j] ) break; 010653 x = aHeap[i]; 010654 aHeap[i] = aHeap[j]; 010655 aHeap[j] = x; 010656 i = j; 010657 } 010658 return 1; 010659 } 010660 010661 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 010662 /* 010663 ** Do various sanity checks on a single page of a tree. Return 010664 ** the tree depth. Root pages return 0. Parents of root pages 010665 ** return 1, and so forth. 010666 ** 010667 ** These checks are done: 010668 ** 010669 ** 1. Make sure that cells and freeblocks do not overlap 010670 ** but combine to completely cover the page. 010671 ** 2. Make sure integer cell keys are in order. 010672 ** 3. Check the integrity of overflow pages. 010673 ** 4. Recursively call checkTreePage on all children. 010674 ** 5. Verify that the depth of all children is the same. 
010675 */ 010676 static int checkTreePage( 010677 IntegrityCk *pCheck, /* Context for the sanity check */ 010678 Pgno iPage, /* Page number of the page to check */ 010679 i64 *piMinKey, /* Write minimum integer primary key here */ 010680 i64 maxKey /* Error if integer primary key greater than this */ 010681 ){ 010682 MemPage *pPage = 0; /* The page being analyzed */ 010683 int i; /* Loop counter */ 010684 int rc; /* Result code from subroutine call */ 010685 int depth = -1, d2; /* Depth of a subtree */ 010686 int pgno; /* Page number */ 010687 int nFrag; /* Number of fragmented bytes on the page */ 010688 int hdr; /* Offset to the page header */ 010689 int cellStart; /* Offset to the start of the cell pointer array */ 010690 int nCell; /* Number of cells */ 010691 int doCoverageCheck = 1; /* True if cell coverage checking should be done */ 010692 int keyCanBeEqual = 1; /* True if IPK can be equal to maxKey 010693 ** False if IPK must be strictly less than maxKey */ 010694 u8 *data; /* Page content */ 010695 u8 *pCell; /* Cell content */ 010696 u8 *pCellIdx; /* Next element of the cell pointer array */ 010697 BtShared *pBt; /* The BtShared object that owns pPage */ 010698 u32 pc; /* Address of a cell */ 010699 u32 usableSize; /* Usable size of the page */ 010700 u32 contentOffset; /* Offset to the start of the cell content area */ 010701 u32 *heap = 0; /* Min-heap used for checking cell coverage */ 010702 u32 x, prev = 0; /* Next and previous entry on the min-heap */ 010703 const char *saved_zPfx = pCheck->zPfx; 010704 int saved_v1 = pCheck->v1; 010705 int saved_v2 = pCheck->v2; 010706 u8 savedIsInit = 0; 010707 010708 /* Check that the page exists 010709 */ 010710 checkProgress(pCheck); 010711 if( pCheck->mxErr==0 ) goto end_of_check; 010712 pBt = pCheck->pBt; 010713 usableSize = pBt->usableSize; 010714 if( iPage==0 ) return 0; 010715 if( checkRef(pCheck, iPage) ) return 0; 010716 pCheck->zPfx = "Tree %u page %u: "; 010717 pCheck->v1 = iPage; 010718 if( (rc = 
btreeGetPage(pBt, iPage, &pPage, 0))!=0 ){ 010719 checkAppendMsg(pCheck, 010720 "unable to get the page. error code=%d", rc); 010721 if( rc==SQLITE_IOERR_NOMEM ) pCheck->rc = SQLITE_NOMEM; 010722 goto end_of_check; 010723 } 010724 010725 /* Clear MemPage.isInit to make sure the corruption detection code in 010726 ** btreeInitPage() is executed. */ 010727 savedIsInit = pPage->isInit; 010728 pPage->isInit = 0; 010729 if( (rc = btreeInitPage(pPage))!=0 ){ 010730 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */ 010731 checkAppendMsg(pCheck, 010732 "btreeInitPage() returns error code %d", rc); 010733 goto end_of_check; 010734 } 010735 if( (rc = btreeComputeFreeSpace(pPage))!=0 ){ 010736 assert( rc==SQLITE_CORRUPT ); 010737 checkAppendMsg(pCheck, "free space corruption", rc); 010738 goto end_of_check; 010739 } 010740 data = pPage->aData; 010741 hdr = pPage->hdrOffset; 010742 010743 /* Set up for cell analysis */ 010744 pCheck->zPfx = "Tree %u page %u cell %u: "; 010745 contentOffset = get2byteNotZero(&data[hdr+5]); 010746 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */ 010747 010748 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the 010749 ** number of cells on the page. */ 010750 nCell = get2byte(&data[hdr+3]); 010751 assert( pPage->nCell==nCell ); 010752 010753 /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page 010754 ** immediately follows the b-tree page header. 
*/ 010755 cellStart = hdr + 12 - 4*pPage->leaf; 010756 assert( pPage->aCellIdx==&data[cellStart] ); 010757 pCellIdx = &data[cellStart + 2*(nCell-1)]; 010758 010759 if( !pPage->leaf ){ 010760 /* Analyze the right-child page of internal pages */ 010761 pgno = get4byte(&data[hdr+8]); 010762 #ifndef SQLITE_OMIT_AUTOVACUUM 010763 if( pBt->autoVacuum ){ 010764 pCheck->zPfx = "Tree %u page %u right child: "; 010765 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); 010766 } 010767 #endif 010768 depth = checkTreePage(pCheck, pgno, &maxKey, maxKey); 010769 keyCanBeEqual = 0; 010770 }else{ 010771 /* For leaf pages, the coverage check will occur in the same loop 010772 ** as the other cell checks, so initialize the heap. */ 010773 heap = pCheck->heap; 010774 heap[0] = 0; 010775 } 010776 010777 /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte 010778 ** integer offsets to the cell contents. */ 010779 for(i=nCell-1; i>=0 && pCheck->mxErr; i--){ 010780 CellInfo info; 010781 010782 /* Check cell size */ 010783 pCheck->v2 = i; 010784 assert( pCellIdx==&data[cellStart + i*2] ); 010785 pc = get2byteAligned(pCellIdx); 010786 pCellIdx -= 2; 010787 if( pc<contentOffset || pc>usableSize-4 ){ 010788 checkAppendMsg(pCheck, "Offset %u out of range %u..%u", 010789 pc, contentOffset, usableSize-4); 010790 doCoverageCheck = 0; 010791 continue; 010792 } 010793 pCell = &data[pc]; 010794 pPage->xParseCell(pPage, pCell, &info); 010795 if( pc+info.nSize>usableSize ){ 010796 checkAppendMsg(pCheck, "Extends off end of page"); 010797 doCoverageCheck = 0; 010798 continue; 010799 } 010800 010801 /* Check for integer primary key out of range */ 010802 if( pPage->intKey ){ 010803 if( keyCanBeEqual ? 
(info.nKey > maxKey) : (info.nKey >= maxKey) ){ 010804 checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey); 010805 } 010806 maxKey = info.nKey; 010807 keyCanBeEqual = 0; /* Only the first key on the page may ==maxKey */ 010808 } 010809 010810 /* Check the content overflow list */ 010811 if( info.nPayload>info.nLocal ){ 010812 u32 nPage; /* Number of pages on the overflow chain */ 010813 Pgno pgnoOvfl; /* First page of the overflow chain */ 010814 assert( pc + info.nSize - 4 <= usableSize ); 010815 nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4); 010816 pgnoOvfl = get4byte(&pCell[info.nSize - 4]); 010817 #ifndef SQLITE_OMIT_AUTOVACUUM 010818 if( pBt->autoVacuum ){ 010819 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage); 010820 } 010821 #endif 010822 checkList(pCheck, 0, pgnoOvfl, nPage); 010823 } 010824 010825 if( !pPage->leaf ){ 010826 /* Check sanity of left child page for internal pages */ 010827 pgno = get4byte(pCell); 010828 #ifndef SQLITE_OMIT_AUTOVACUUM 010829 if( pBt->autoVacuum ){ 010830 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); 010831 } 010832 #endif 010833 d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey); 010834 keyCanBeEqual = 0; 010835 if( d2!=depth ){ 010836 checkAppendMsg(pCheck, "Child page depth differs"); 010837 depth = d2; 010838 } 010839 }else{ 010840 /* Populate the coverage-checking heap for leaf pages */ 010841 btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1)); 010842 } 010843 } 010844 *piMinKey = maxKey; 010845 010846 /* Check for complete coverage of the page 010847 */ 010848 pCheck->zPfx = 0; 010849 if( doCoverageCheck && pCheck->mxErr>0 ){ 010850 /* For leaf pages, the min-heap has already been initialized and the 010851 ** cells have already been inserted. 
But for internal pages, that has 010852 ** not yet been done, so do it now */ 010853 if( !pPage->leaf ){ 010854 heap = pCheck->heap; 010855 heap[0] = 0; 010856 for(i=nCell-1; i>=0; i--){ 010857 u32 size; 010858 pc = get2byteAligned(&data[cellStart+i*2]); 010859 size = pPage->xCellSize(pPage, &data[pc]); 010860 btreeHeapInsert(heap, (pc<<16)|(pc+size-1)); 010861 } 010862 } 010863 /* Add the freeblocks to the min-heap 010864 ** 010865 ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header 010866 ** is the offset of the first freeblock, or zero if there are no 010867 ** freeblocks on the page. 010868 */ 010869 i = get2byte(&data[hdr+1]); 010870 while( i>0 ){ 010871 int size, j; 010872 assert( (u32)i<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */ 010873 size = get2byte(&data[i+2]); 010874 assert( (u32)(i+size)<=usableSize ); /* due to btreeComputeFreeSpace() */ 010875 btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1)); 010876 /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a 010877 ** big-endian integer which is the offset in the b-tree page of the next 010878 ** freeblock in the chain, or zero if the freeblock is the last on the 010879 ** chain. */ 010880 j = get2byte(&data[i]); 010881 /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of 010882 ** increasing offset. */ 010883 assert( j==0 || j>i+size ); /* Enforced by btreeComputeFreeSpace() */ 010884 assert( (u32)j<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */ 010885 i = j; 010886 } 010887 /* Analyze the min-heap looking for overlap between cells and/or 010888 ** freeblocks, and counting the number of untracked bytes in nFrag. 010889 ** 010890 ** Each min-heap entry is of the form: (start_address<<16)|end_address. 010891 ** There is an implied first entry the covers the page header, the cell 010892 ** pointer index, and the gap between the cell pointer index and the start 010893 ** of cell content. 
010894 ** 010895 ** The loop below pulls entries from the min-heap in order and compares 010896 ** the start_address against the previous end_address. If there is an 010897 ** overlap, that means bytes are used multiple times. If there is a gap, 010898 ** that gap is added to the fragmentation count. 010899 */ 010900 nFrag = 0; 010901 prev = contentOffset - 1; /* Implied first min-heap entry */ 010902 while( btreeHeapPull(heap,&x) ){ 010903 if( (prev&0xffff)>=(x>>16) ){ 010904 checkAppendMsg(pCheck, 010905 "Multiple uses for byte %u of page %u", x>>16, iPage); 010906 break; 010907 }else{ 010908 nFrag += (x>>16) - (prev&0xffff) - 1; 010909 prev = x; 010910 } 010911 } 010912 nFrag += usableSize - (prev&0xffff) - 1; 010913 /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments 010914 ** is stored in the fifth field of the b-tree page header. 010915 ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the 010916 ** number of fragmented free bytes within the cell content area. 010917 */ 010918 if( heap[0]==0 && nFrag!=data[hdr+7] ){ 010919 checkAppendMsg(pCheck, 010920 "Fragmentation of %u bytes reported as %u on page %u", 010921 nFrag, data[hdr+7], iPage); 010922 } 010923 } 010924 010925 end_of_check: 010926 if( !doCoverageCheck ) pPage->isInit = savedIsInit; 010927 releasePage(pPage); 010928 pCheck->zPfx = saved_zPfx; 010929 pCheck->v1 = saved_v1; 010930 pCheck->v2 = saved_v2; 010931 return depth+1; 010932 } 010933 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 010934 010935 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 010936 /* 010937 ** This routine does a complete check of the given BTree file. aRoot[] is 010938 ** an array of pages numbers were each page number is the root page of 010939 ** a table. nRoot is the number of entries in aRoot. 010940 ** 010941 ** A read-only or read-write transaction must be opened before calling 010942 ** this function. 010943 ** 010944 ** Write the number of error seen in *pnErr. 
Except for some memory 010945 ** allocation errors, an error message held in memory obtained from 010946 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is 010947 ** returned. If a memory allocation error occurs, NULL is returned. 010948 ** 010949 ** If the first entry in aRoot[] is 0, that indicates that the list of 010950 ** root pages is incomplete. This is a "partial integrity-check". This 010951 ** happens when performing an integrity check on a single table. The 010952 ** zero is skipped, of course. But in addition, the freelist checks 010953 ** and the checks to make sure every page is referenced are also skipped, 010954 ** since obviously it is not possible to know which pages are covered by 010955 ** the unverified btrees. Except, if aRoot[1] is 1, then the freelist 010956 ** checks are still performed. 010957 */ 010958 int sqlite3BtreeIntegrityCheck( 010959 sqlite3 *db, /* Database connection that is running the check */ 010960 Btree *p, /* The btree to be checked */ 010961 Pgno *aRoot, /* An array of root pages numbers for individual trees */ 010962 int nRoot, /* Number of entries in aRoot[] */ 010963 int mxErr, /* Stop reporting errors after this many */ 010964 int *pnErr, /* OUT: Write number of errors seen to this variable */ 010965 char **pzOut /* OUT: Write the error message string here */ 010966 ){ 010967 Pgno i; 010968 IntegrityCk sCheck; 010969 BtShared *pBt = p->pBt; 010970 u64 savedDbFlags = pBt->db->flags; 010971 char zErr[100]; 010972 int bPartial = 0; /* True if not checking all btrees */ 010973 int bCkFreelist = 1; /* True to scan the freelist */ 010974 VVA_ONLY( int nRef ); 010975 assert( nRoot>0 ); 010976 010977 /* aRoot[0]==0 means this is a partial check */ 010978 if( aRoot[0]==0 ){ 010979 assert( nRoot>1 ); 010980 bPartial = 1; 010981 if( aRoot[1]!=1 ) bCkFreelist = 0; 010982 } 010983 010984 sqlite3BtreeEnter(p); 010985 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE ); 010986 VVA_ONLY( nRef = 
sqlite3PagerRefcount(pBt->pPager) ); 010987 assert( nRef>=0 ); 010988 memset(&sCheck, 0, sizeof(sCheck)); 010989 sCheck.db = db; 010990 sCheck.pBt = pBt; 010991 sCheck.pPager = pBt->pPager; 010992 sCheck.nCkPage = btreePagecount(sCheck.pBt); 010993 sCheck.mxErr = mxErr; 010994 sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH); 010995 sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL; 010996 if( sCheck.nCkPage==0 ){ 010997 goto integrity_ck_cleanup; 010998 } 010999 011000 sCheck.aPgRef = sqlite3MallocZero((sCheck.nCkPage / 8)+ 1); 011001 if( !sCheck.aPgRef ){ 011002 checkOom(&sCheck); 011003 goto integrity_ck_cleanup; 011004 } 011005 sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize ); 011006 if( sCheck.heap==0 ){ 011007 checkOom(&sCheck); 011008 goto integrity_ck_cleanup; 011009 } 011010 011011 i = PENDING_BYTE_PAGE(pBt); 011012 if( i<=sCheck.nCkPage ) setPageReferenced(&sCheck, i); 011013 011014 /* Check the integrity of the freelist 011015 */ 011016 if( bCkFreelist ){ 011017 sCheck.zPfx = "Freelist: "; 011018 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), 011019 get4byte(&pBt->pPage1->aData[36])); 011020 sCheck.zPfx = 0; 011021 } 011022 011023 /* Check all the tables. 
011024 */ 011025 #ifndef SQLITE_OMIT_AUTOVACUUM 011026 if( !bPartial ){ 011027 if( pBt->autoVacuum ){ 011028 Pgno mx = 0; 011029 Pgno mxInHdr; 011030 for(i=0; (int)i<nRoot; i++) if( mx<aRoot[i] ) mx = aRoot[i]; 011031 mxInHdr = get4byte(&pBt->pPage1->aData[52]); 011032 if( mx!=mxInHdr ){ 011033 checkAppendMsg(&sCheck, 011034 "max rootpage (%u) disagrees with header (%u)", 011035 mx, mxInHdr 011036 ); 011037 } 011038 }else if( get4byte(&pBt->pPage1->aData[64])!=0 ){ 011039 checkAppendMsg(&sCheck, 011040 "incremental_vacuum enabled with a max rootpage of zero" 011041 ); 011042 } 011043 } 011044 #endif 011045 testcase( pBt->db->flags & SQLITE_CellSizeCk ); 011046 pBt->db->flags &= ~(u64)SQLITE_CellSizeCk; 011047 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){ 011048 i64 notUsed; 011049 if( aRoot[i]==0 ) continue; 011050 #ifndef SQLITE_OMIT_AUTOVACUUM 011051 if( pBt->autoVacuum && aRoot[i]>1 && !bPartial ){ 011052 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0); 011053 } 011054 #endif 011055 sCheck.v0 = aRoot[i]; 011056 checkTreePage(&sCheck, aRoot[i], ¬Used, LARGEST_INT64); 011057 } 011058 pBt->db->flags = savedDbFlags; 011059 011060 /* Make sure every page in the file is referenced 011061 */ 011062 if( !bPartial ){ 011063 for(i=1; i<=sCheck.nCkPage && sCheck.mxErr; i++){ 011064 #ifdef SQLITE_OMIT_AUTOVACUUM 011065 if( getPageReferenced(&sCheck, i)==0 ){ 011066 checkAppendMsg(&sCheck, "Page %u: never used", i); 011067 } 011068 #else 011069 /* If the database supports auto-vacuum, make sure no tables contain 011070 ** references to pointer-map pages. 
011071 */ 011072 if( getPageReferenced(&sCheck, i)==0 && 011073 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ 011074 checkAppendMsg(&sCheck, "Page %u: never used", i); 011075 } 011076 if( getPageReferenced(&sCheck, i)!=0 && 011077 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ 011078 checkAppendMsg(&sCheck, "Page %u: pointer map referenced", i); 011079 } 011080 #endif 011081 } 011082 } 011083 011084 /* Clean up and report errors. 011085 */ 011086 integrity_ck_cleanup: 011087 sqlite3PageFree(sCheck.heap); 011088 sqlite3_free(sCheck.aPgRef); 011089 *pnErr = sCheck.nErr; 011090 if( sCheck.nErr==0 ){ 011091 sqlite3_str_reset(&sCheck.errMsg); 011092 *pzOut = 0; 011093 }else{ 011094 *pzOut = sqlite3StrAccumFinish(&sCheck.errMsg); 011095 } 011096 /* Make sure this analysis did not leave any unref() pages. */ 011097 assert( nRef==sqlite3PagerRefcount(pBt->pPager) ); 011098 sqlite3BtreeLeave(p); 011099 return sCheck.rc; 011100 } 011101 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 011102 011103 /* 011104 ** Return the full pathname of the underlying database file. Return 011105 ** an empty string if the database is in-memory or a TEMP database. 011106 ** 011107 ** The pager filename is invariant as long as the pager is 011108 ** open so it is safe to access without the BtShared mutex. 011109 */ 011110 const char *sqlite3BtreeGetFilename(Btree *p){ 011111 assert( p->pBt->pPager!=0 ); 011112 return sqlite3PagerFilename(p->pBt->pPager, 1); 011113 } 011114 011115 /* 011116 ** Return the pathname of the journal file for this database. The return 011117 ** value of this routine is the same regardless of whether the journal file 011118 ** has been created or not. 011119 ** 011120 ** The pager journal filename is invariant as long as the pager is 011121 ** open so it is safe to access without the BtShared mutex. 
011122 */ 011123 const char *sqlite3BtreeGetJournalname(Btree *p){ 011124 assert( p->pBt->pPager!=0 ); 011125 return sqlite3PagerJournalname(p->pBt->pPager); 011126 } 011127 011128 /* 011129 ** Return one of SQLITE_TXN_NONE, SQLITE_TXN_READ, or SQLITE_TXN_WRITE 011130 ** to describe the current transaction state of Btree p. 011131 */ 011132 int sqlite3BtreeTxnState(Btree *p){ 011133 assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); 011134 return p ? p->inTrans : 0; 011135 } 011136 011137 #ifndef SQLITE_OMIT_WAL 011138 /* 011139 ** Run a checkpoint on the Btree passed as the first argument. 011140 ** 011141 ** Return SQLITE_LOCKED if this or any other connection has an open 011142 ** transaction on the shared-cache the argument Btree is connected to. 011143 ** 011144 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART. 011145 */ 011146 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){ 011147 int rc = SQLITE_OK; 011148 if( p ){ 011149 BtShared *pBt = p->pBt; 011150 sqlite3BtreeEnter(p); 011151 if( pBt->inTransaction!=TRANS_NONE ){ 011152 rc = SQLITE_LOCKED; 011153 }else{ 011154 rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt); 011155 } 011156 sqlite3BtreeLeave(p); 011157 } 011158 return rc; 011159 } 011160 #endif 011161 011162 /* 011163 ** Return true if there is currently a backup running on Btree p. 011164 */ 011165 int sqlite3BtreeIsInBackup(Btree *p){ 011166 assert( p ); 011167 assert( sqlite3_mutex_held(p->db->mutex) ); 011168 return p->nBackup!=0; 011169 } 011170 011171 /* 011172 ** This function returns a pointer to a blob of memory associated with 011173 ** a single shared-btree. The memory is used by client code for its own 011174 ** purposes (for example, to store a high-level schema associated with 011175 ** the shared-btree). The btree layer manages reference counting issues. 
011176 ** 011177 ** The first time this is called on a shared-btree, nBytes bytes of memory 011178 ** are allocated, zeroed, and returned to the caller. For each subsequent 011179 ** call the nBytes parameter is ignored and a pointer to the same blob 011180 ** of memory returned. 011181 ** 011182 ** If the nBytes parameter is 0 and the blob of memory has not yet been 011183 ** allocated, a null pointer is returned. If the blob has already been 011184 ** allocated, it is returned as normal. 011185 ** 011186 ** Just before the shared-btree is closed, the function passed as the 011187 ** xFree argument when the memory allocation was made is invoked on the 011188 ** blob of allocated memory. The xFree function should not call sqlite3_free() 011189 ** on the memory, the btree layer does that. 011190 */ 011191 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ 011192 BtShared *pBt = p->pBt; 011193 sqlite3BtreeEnter(p); 011194 if( !pBt->pSchema && nBytes ){ 011195 pBt->pSchema = sqlite3DbMallocZero(0, nBytes); 011196 pBt->xFreeSchema = xFree; 011197 } 011198 sqlite3BtreeLeave(p); 011199 return pBt->pSchema; 011200 } 011201 011202 /* 011203 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared 011204 ** btree as the argument handle holds an exclusive lock on the 011205 ** sqlite_schema table. Otherwise SQLITE_OK. 011206 */ 011207 int sqlite3BtreeSchemaLocked(Btree *p){ 011208 int rc; 011209 assert( sqlite3_mutex_held(p->db->mutex) ); 011210 sqlite3BtreeEnter(p); 011211 rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK); 011212 assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE ); 011213 sqlite3BtreeLeave(p); 011214 return rc; 011215 } 011216 011217 011218 #ifndef SQLITE_OMIT_SHARED_CACHE 011219 /* 011220 ** Obtain a lock on the table whose root page is iTab. The 011221 ** lock is a write lock if isWritelock is true or a read lock 011222 ** if it is false. 
011223 */ 011224 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ 011225 int rc = SQLITE_OK; 011226 assert( p->inTrans!=TRANS_NONE ); 011227 if( p->sharable ){ 011228 u8 lockType = READ_LOCK + isWriteLock; 011229 assert( READ_LOCK+1==WRITE_LOCK ); 011230 assert( isWriteLock==0 || isWriteLock==1 ); 011231 011232 sqlite3BtreeEnter(p); 011233 rc = querySharedCacheTableLock(p, iTab, lockType); 011234 if( rc==SQLITE_OK ){ 011235 rc = setSharedCacheTableLock(p, iTab, lockType); 011236 } 011237 sqlite3BtreeLeave(p); 011238 } 011239 return rc; 011240 } 011241 #endif 011242 011243 #ifndef SQLITE_OMIT_INCRBLOB 011244 /* 011245 ** Argument pCsr must be a cursor opened for writing on an 011246 ** INTKEY table currently pointing at a valid table entry. 011247 ** This function modifies the data stored as part of that entry. 011248 ** 011249 ** Only the data content may only be modified, it is not possible to 011250 ** change the length of the data stored. If this function is called with 011251 ** parameters that attempt to write past the end of the existing data, 011252 ** no modifications are made and SQLITE_CORRUPT is returned. 011253 */ 011254 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ 011255 int rc; 011256 assert( cursorOwnsBtShared(pCsr) ); 011257 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) ); 011258 assert( pCsr->curFlags & BTCF_Incrblob ); 011259 011260 rc = restoreCursorPosition(pCsr); 011261 if( rc!=SQLITE_OK ){ 011262 return rc; 011263 } 011264 assert( pCsr->eState!=CURSOR_REQUIRESEEK ); 011265 if( pCsr->eState!=CURSOR_VALID ){ 011266 return SQLITE_ABORT; 011267 } 011268 011269 /* Save the positions of all other cursors open on this table. This is 011270 ** required in case any of them are holding references to an xFetch 011271 ** version of the b-tree page modified by the accessPayload call below. 
011272 ** 011273 ** Note that pCsr must be open on a INTKEY table and saveCursorPosition() 011274 ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence 011275 ** saveAllCursors can only return SQLITE_OK. 011276 */ 011277 VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr); 011278 assert( rc==SQLITE_OK ); 011279 011280 /* Check some assumptions: 011281 ** (a) the cursor is open for writing, 011282 ** (b) there is a read/write transaction open, 011283 ** (c) the connection holds a write-lock on the table (if required), 011284 ** (d) there are no conflicting read-locks, and 011285 ** (e) the cursor points at a valid row of an intKey table. 011286 */ 011287 if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){ 011288 return SQLITE_READONLY; 011289 } 011290 assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0 011291 && pCsr->pBt->inTransaction==TRANS_WRITE ); 011292 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) ); 011293 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) ); 011294 assert( pCsr->pPage->intKey ); 011295 011296 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1); 011297 } 011298 011299 /* 011300 ** Mark this cursor as an incremental blob cursor. 011301 */ 011302 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){ 011303 pCur->curFlags |= BTCF_Incrblob; 011304 pCur->pBtree->hasIncrblobCur = 1; 011305 } 011306 #endif 011307 011308 /* 011309 ** Set both the "read version" (single byte at byte offset 18) and 011310 ** "write version" (single byte at byte offset 19) fields in the database 011311 ** header to iVersion. 011312 */ 011313 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){ 011314 BtShared *pBt = pBtree->pBt; 011315 int rc; /* Return code */ 011316 011317 assert( iVersion==1 || iVersion==2 ); 011318 011319 /* If setting the version fields to 1, do not automatically open the 011320 ** WAL connection, even if the version fields are currently set to 2. 
011321 */ 011322 pBt->btsFlags &= ~BTS_NO_WAL; 011323 if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL; 011324 011325 rc = sqlite3BtreeBeginTrans(pBtree, 0, 0); 011326 if( rc==SQLITE_OK ){ 011327 u8 *aData = pBt->pPage1->aData; 011328 if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){ 011329 rc = sqlite3BtreeBeginTrans(pBtree, 2, 0); 011330 if( rc==SQLITE_OK ){ 011331 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 011332 if( rc==SQLITE_OK ){ 011333 aData[18] = (u8)iVersion; 011334 aData[19] = (u8)iVersion; 011335 } 011336 } 011337 } 011338 } 011339 011340 pBt->btsFlags &= ~BTS_NO_WAL; 011341 return rc; 011342 } 011343 011344 /* 011345 ** Return true if the cursor has a hint specified. This routine is 011346 ** only used from within assert() statements 011347 */ 011348 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){ 011349 return (pCsr->hints & mask)!=0; 011350 } 011351 011352 /* 011353 ** Return true if the given Btree is read-only. 011354 */ 011355 int sqlite3BtreeIsReadonly(Btree *p){ 011356 return (p->pBt->btsFlags & BTS_READ_ONLY)!=0; 011357 } 011358 011359 /* 011360 ** Return the size of the header added to each page by this module. 011361 */ 011362 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); } 011363 011364 /* 011365 ** If no transaction is active and the database is not a temp-db, clear 011366 ** the in-memory pager cache. 011367 */ 011368 void sqlite3BtreeClearCache(Btree *p){ 011369 BtShared *pBt = p->pBt; 011370 if( pBt->inTransaction==TRANS_NONE ){ 011371 sqlite3PagerClearCache(pBt->pPager); 011372 } 011373 } 011374 011375 #if !defined(SQLITE_OMIT_SHARED_CACHE) 011376 /* 011377 ** Return true if the Btree passed as the only argument is sharable. 011378 */ 011379 int sqlite3BtreeSharable(Btree *p){ 011380 return p->sharable; 011381 } 011382 011383 /* 011384 ** Return the number of connections to the BtShared object accessed by 011385 ** the Btree handle passed as the only argument. 
For private caches 011386 ** this is always 1. For shared caches it may be 1 or greater. 011387 */ 011388 int sqlite3BtreeConnectionCount(Btree *p){ 011389 testcase( p->sharable ); 011390 return p->pBt->nRef; 011391 } 011392 #endif