summaryrefslogtreecommitdiff
path: root/src/backend/storage/smgr/md.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/storage/smgr/md.c')
-rw-r--r--src/backend/storage/smgr/md.c1104
1 files changed, 0 insertions, 1104 deletions
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
deleted file mode 100644
index 978d85d4868..00000000000
--- a/src/backend/storage/smgr/md.c
+++ /dev/null
@@ -1,1104 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * md.c
- * This code manages relations that reside on magnetic disk.
- *
- * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- *
- * IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.91 2002/06/20 20:29:35 momjian Exp $
- *
- *-------------------------------------------------------------------------
- */
-#include "postgres.h"
-
-#include <errno.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/file.h>
-
-#include "catalog/catalog.h"
-#include "miscadmin.h"
-#include "storage/smgr.h"
-#include "utils/inval.h"
-#include "utils/memutils.h"
-
-
-#undef DIAGNOSTIC
-
-/*
- * The magnetic disk storage manager keeps track of open file descriptors
- * in its own descriptor pool. This happens for two reasons. First, at
- * transaction boundaries, we walk the list of descriptors and flush
- * anything that we've dirtied in the current transaction. Second, we want
- * to support relations larger than the OS' file size limit (often 2GBytes).
- * In order to do that, we break relations up into chunks of < 2GBytes
- * and store one chunk in each of several files that represent the relation.
- * See the BLCKSZ and RELSEG_SIZE configuration constants in include/pg_config.h.
- *
- * The file descriptor stored in the relation cache (see RelationGetFile())
- * is actually an index into the Md_fdvec array. -1 indicates not open.
- *
- * When a relation is broken into multiple chunks, only the first chunk
- * has its own entry in the Md_fdvec array; the remaining chunks have
- * palloc'd MdfdVec objects that are chained onto the first chunk via the
- * mdfd_chain links. All chunks except the last MUST have size exactly
- * equal to RELSEG_SIZE blocks --- see mdnblocks() and mdtruncate().
- */
-
-typedef struct _MdfdVec
-{
- int mdfd_vfd; /* fd number in vfd pool */
- int mdfd_flags; /* fd status flags */
-
-/* these are the assigned bits in mdfd_flags: */
-#define MDFD_FREE (1 << 0) /* unused entry */
-
- int mdfd_nextFree; /* link to next freelist member, if free */
-#ifndef LET_OS_MANAGE_FILESIZE
- struct _MdfdVec *mdfd_chain; /* for large relations */
-#endif
-} MdfdVec;
-
-static int Nfds = 100; /* initial/current size of Md_fdvec array */
-static MdfdVec *Md_fdvec = (MdfdVec *) NULL;
-static int Md_Free = -1; /* head of freelist of unused fdvec
- * entries */
-static int CurFd = 0; /* first never-used fdvec index */
-static MemoryContext MdCxt; /* context for all my allocations */
-
-/* routines declared here */
-static void mdclose_fd(int fd);
-static int _mdfd_getrelnfd(Relation reln);
-static MdfdVec *_mdfd_openseg(Relation reln, BlockNumber segno, int oflags);
-static MdfdVec *_mdfd_getseg(Relation reln, BlockNumber blkno);
-
-static int _mdfd_blind_getseg(RelFileNode rnode, BlockNumber blkno);
-
-static int _fdvec_alloc(void);
-static void _fdvec_free(int);
-static BlockNumber _mdnblocks(File file, Size blcksz);
-
-/*
- * mdinit() -- Initialize private state for magnetic disk storage manager.
- *
- * We keep a private table of all file descriptors. Whenever we do
- * a write to one, we mark it dirty in our table. Whenever we force
- * changes to disk, we mark the file descriptor clean. At transaction
- * commit, we force changes to disk for all dirty file descriptors.
- * This routine allocates and initializes the table.
- *
- * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
- */
-int
-mdinit(void)
-{
- int i;
-
- MdCxt = AllocSetContextCreate(TopMemoryContext,
- "MdSmgr",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
-
- Md_fdvec = (MdfdVec *) MemoryContextAlloc(MdCxt, Nfds * sizeof(MdfdVec));
-
- MemSet(Md_fdvec, 0, Nfds * sizeof(MdfdVec));
-
- /* Set free list */
- for (i = 0; i < Nfds; i++)
- {
- Md_fdvec[i].mdfd_nextFree = i + 1;
- Md_fdvec[i].mdfd_flags = MDFD_FREE;
- }
- Md_Free = 0;
- Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
-
- return SM_SUCCESS;
-}
-
-int
-mdcreate(Relation reln)
-{
- char *path;
- int fd,
- vfd;
-
- Assert(reln->rd_fd < 0);
-
- path = relpath(reln->rd_node);
-
- fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
-
- if (fd < 0)
- {
- int save_errno = errno;
-
- /*
- * During bootstrap, there are cases where a system relation will
- * be accessed (by internal backend processes) before the
- * bootstrap script nominally creates it. Therefore, allow the
- * file to exist already, but in bootstrap mode only. (See also
- * mdopen)
- */
- if (IsBootstrapProcessingMode())
- fd = FileNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
- if (fd < 0)
- {
- pfree(path);
- /* be sure to return the error reported by create, not open */
- errno = save_errno;
- return -1;
- }
- errno = 0;
- }
-
- pfree(path);
-
- vfd = _fdvec_alloc();
- if (vfd < 0)
- return -1;
-
- Md_fdvec[vfd].mdfd_vfd = fd;
- Md_fdvec[vfd].mdfd_flags = (uint16) 0;
-#ifndef LET_OS_MANAGE_FILESIZE
- Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;
-#endif
-
- return vfd;
-}
-
-/*
- * mdunlink() -- Unlink a relation.
- */
-int
-mdunlink(RelFileNode rnode)
-{
- int status = SM_SUCCESS;
- int save_errno = 0;
- char *path;
-
- path = relpath(rnode);
-
- /* Delete the first segment, or only segment if not doing segmenting */
- if (unlink(path) < 0)
- {
- status = SM_FAIL;
- save_errno = errno;
- }
-
-#ifndef LET_OS_MANAGE_FILESIZE
- /* Get the additional segments, if any */
- if (status == SM_SUCCESS)
- {
- char *segpath = (char *) palloc(strlen(path) + 12);
- BlockNumber segno;
-
- for (segno = 1;; segno++)
- {
- sprintf(segpath, "%s.%u", path, segno);
- if (unlink(segpath) < 0)
- {
- /* ENOENT is expected after the last segment... */
- if (errno != ENOENT)
- {
- status = SM_FAIL;
- save_errno = errno;
- }
- break;
- }
- }
- pfree(segpath);
- }
-#endif
-
- pfree(path);
-
- errno = save_errno;
- return status;
-}
-
-/*
- * mdextend() -- Add a block to the specified relation.
- *
- * The semantics are basically the same as mdwrite(): write at the
- * specified position. However, we are expecting to extend the
- * relation (ie, blocknum is the current EOF), and so in case of
- * failure we clean up by truncating.
- *
- * This routine returns SM_FAIL or SM_SUCCESS, with errno set as
- * appropriate.
- *
- * Note: this routine used to call mdnblocks() to get the block position
- * to write at, but that's pretty silly since the caller needs to know where
- * the block will be written, and accordingly must have done mdnblocks()
- * already. Might as well pass in the position and save a seek.
- */
-int
-mdextend(Relation reln, BlockNumber blocknum, char *buffer)
-{
- long seekpos;
- int nbytes;
- MdfdVec *v;
-
- v = _mdfd_getseg(reln, blocknum);
-
-#ifndef LET_OS_MANAGE_FILESIZE
- seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
-#ifdef DIAGNOSTIC
- if (seekpos >= BLCKSZ * RELSEG_SIZE)
- elog(FATAL, "seekpos too big!");
-#endif
-#else
- seekpos = (long) (BLCKSZ * (blocknum));
-#endif
-
- /*
- * Note: because caller obtained blocknum by calling mdnblocks, which
- * did a seek(SEEK_END), this seek is often redundant and will be
- * optimized away by fd.c. It's not redundant, however, if there is a
- * partial page at the end of the file. In that case we want to try
- * to overwrite the partial page with a full page. It's also not
- * redundant if bufmgr.c had to dump another buffer of the same file
- * to make room for the new page's buffer.
- */
- if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- return SM_FAIL;
-
- if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
- {
- if (nbytes > 0)
- {
- int save_errno = errno;
-
- /* Remove the partially-written page */
- FileTruncate(v->mdfd_vfd, seekpos);
- FileSeek(v->mdfd_vfd, seekpos, SEEK_SET);
- errno = save_errno;
- }
- return SM_FAIL;
- }
-
-#ifndef LET_OS_MANAGE_FILESIZE
-#ifdef DIAGNOSTIC
- if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > ((BlockNumber) RELSEG_SIZE))
- elog(FATAL, "segment too big!");
-#endif
-#endif
-
- return SM_SUCCESS;
-}
-
-/*
- * mdopen() -- Open the specified relation.
- */
-int
-mdopen(Relation reln)
-{
- char *path;
- int fd;
- int vfd;
-
- Assert(reln->rd_fd < 0);
-
- path = relpath(reln->rd_node);
-
- fd = FileNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
-
- if (fd < 0)
- {
- /*
- * During bootstrap, there are cases where a system relation will
- * be accessed (by internal backend processes) before the
- * bootstrap script nominally creates it. Therefore, accept
- * mdopen() as a substitute for mdcreate() in bootstrap mode only.
- * (See mdcreate)
- */
- if (IsBootstrapProcessingMode())
- fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
- if (fd < 0)
- {
- pfree(path);
- return -1;
- }
- }
-
- pfree(path);
-
- vfd = _fdvec_alloc();
- if (vfd < 0)
- return -1;
-
- Md_fdvec[vfd].mdfd_vfd = fd;
- Md_fdvec[vfd].mdfd_flags = (uint16) 0;
-#ifndef LET_OS_MANAGE_FILESIZE
- Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;
-
-#ifdef DIAGNOSTIC
- if (_mdnblocks(fd, BLCKSZ) > ((BlockNumber) RELSEG_SIZE))
- elog(FATAL, "segment too big on relopen!");
-#endif
-#endif
-
- return vfd;
-}
-
-/*
- * mdclose() -- Close the specified relation, if it isn't closed already.
- *
- * AND FREE fd vector! It may be re-used for other relation!
- * reln should be flushed from cache after closing !..
- *
- * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
- */
-int
-mdclose(Relation reln)
-{
- int fd;
-
- fd = RelationGetFile(reln);
- if (fd < 0)
- return SM_SUCCESS; /* already closed, so no work */
-
- mdclose_fd(fd);
-
- reln->rd_fd = -1;
-
- return SM_SUCCESS;
-}
-
-static void
-mdclose_fd(int fd)
-{
- MdfdVec *v;
-
-#ifndef LET_OS_MANAGE_FILESIZE
- for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)
- {
- MdfdVec *ov = v;
-
- /* if not closed already */
- if (v->mdfd_vfd >= 0)
- {
- /*
- * We sync the file descriptor so that we don't need to reopen
- * it at transaction commit to force changes to disk. (This
- * is not really optional, because we are about to forget that
- * the file even exists...)
- */
- FileSync(v->mdfd_vfd);
- FileClose(v->mdfd_vfd);
- }
- /* Now free vector */
- v = v->mdfd_chain;
- if (ov != &Md_fdvec[fd])
- pfree(ov);
- }
-
- Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
-#else
- v = &Md_fdvec[fd];
- if (v != (MdfdVec *) NULL)
- {
- if (v->mdfd_vfd >= 0)
- {
- /*
- * We sync the file descriptor so that we don't need to reopen
- * it at transaction commit to force changes to disk. (This
- * is not really optional, because we are about to forget that
- * the file even exists...)
- */
- FileSync(v->mdfd_vfd);
- FileClose(v->mdfd_vfd);
- }
- }
-#endif
-
- _fdvec_free(fd);
-}
-
-/*
- * mdread() -- Read the specified block from a relation.
- *
- * Returns SM_SUCCESS or SM_FAIL.
- */
-int
-mdread(Relation reln, BlockNumber blocknum, char *buffer)
-{
- int status;
- long seekpos;
- int nbytes;
- MdfdVec *v;
-
- v = _mdfd_getseg(reln, blocknum);
-
-#ifndef LET_OS_MANAGE_FILESIZE
- seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
-
-#ifdef DIAGNOSTIC
- if (seekpos >= BLCKSZ * RELSEG_SIZE)
- elog(FATAL, "seekpos too big!");
-#endif
-#else
- seekpos = (long) (BLCKSZ * (blocknum));
-#endif
-
- if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- return SM_FAIL;
-
- status = SM_SUCCESS;
- if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
- {
- /*
- * If we are at EOF, return zeroes without complaining. (XXX Is
- * this still necessary/a good idea??)
- */
- if (nbytes == 0 ||
- (nbytes > 0 && mdnblocks(reln) == blocknum))
- MemSet(buffer, 0, BLCKSZ);
- else
- status = SM_FAIL;
- }
-
- return status;
-}
-
-/*
- * mdwrite() -- Write the supplied block at the appropriate location.
- *
- * Returns SM_SUCCESS or SM_FAIL.
- */
-int
-mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
-{
- long seekpos;
- MdfdVec *v;
-
- v = _mdfd_getseg(reln, blocknum);
-
-#ifndef LET_OS_MANAGE_FILESIZE
- seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
-#ifdef DIAGNOSTIC
- if (seekpos >= BLCKSZ * RELSEG_SIZE)
- elog(FATAL, "seekpos too big!");
-#endif
-#else
- seekpos = (long) (BLCKSZ * (blocknum));
-#endif
-
- if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- return SM_FAIL;
-
- if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
- return SM_FAIL;
-
- return SM_SUCCESS;
-}
-
-/*
- * mdflush() -- Synchronously write a block to disk.
- *
- * This is exactly like mdwrite(), but doesn't return until the file
- * system buffer cache has been flushed.
- */
-int
-mdflush(Relation reln, BlockNumber blocknum, char *buffer)
-{
- int status;
- long seekpos;
- MdfdVec *v;
-
- v = _mdfd_getseg(reln, blocknum);
-
-#ifndef LET_OS_MANAGE_FILESIZE
- seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
-#ifdef DIAGNOSTIC
- if (seekpos >= BLCKSZ * RELSEG_SIZE)
- elog(FATAL, "seekpos too big!");
-#endif
-#else
- seekpos = (long) (BLCKSZ * (blocknum));
-#endif
-
- if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- return SM_FAIL;
-
- /* write and sync the block */
- status = SM_SUCCESS;
- if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ
- || FileSync(v->mdfd_vfd) < 0)
- status = SM_FAIL;
-
- return status;
-}
-
-/*
- * mdblindwrt() -- Write a block to disk blind.
- *
- * We have to be able to do this using only the name and OID of
- * the database and relation in which the block belongs. Otherwise
- * this is much like mdwrite(). If dofsync is TRUE, then we fsync
- * the file, making it more like mdflush().
- */
-int
-mdblindwrt(RelFileNode rnode,
- BlockNumber blkno,
- char *buffer,
- bool dofsync)
-{
- int status;
- long seekpos;
- int fd;
-
- fd = _mdfd_blind_getseg(rnode, blkno);
-
- if (fd < 0)
- return SM_FAIL;
-
-#ifndef LET_OS_MANAGE_FILESIZE
- seekpos = (long) (BLCKSZ * (blkno % ((BlockNumber) RELSEG_SIZE)));
-#ifdef DIAGNOSTIC
- if (seekpos >= BLCKSZ * RELSEG_SIZE)
- elog(FATAL, "seekpos too big!");
-#endif
-#else
- seekpos = (long) (BLCKSZ * (blkno));
-#endif
-
- errno = 0;
-
- if (lseek(fd, seekpos, SEEK_SET) != seekpos)
- {
- elog(LOG, "mdblindwrt: lseek(%ld) failed: %m", seekpos);
- close(fd);
- return SM_FAIL;
- }
-
- status = SM_SUCCESS;
-
- /* write and optionally sync the block */
- errno = 0;
- if (write(fd, buffer, BLCKSZ) != BLCKSZ)
- {
- /* if write didn't set errno, assume problem is no disk space */
- if (errno == 0)
- errno = ENOSPC;
- elog(LOG, "mdblindwrt: write() failed: %m");
- status = SM_FAIL;
- }
-
- if (close(fd) < 0)
- {
- elog(LOG, "mdblindwrt: close() failed: %m");
- status = SM_FAIL;
- }
-
- return status;
-}
-
-/*
- * mdmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync).
- *
- * Returns SM_SUCCESS or SM_FAIL.
- */
-int
-mdmarkdirty(Relation reln, BlockNumber blkno)
-{
- MdfdVec *v;
-
- v = _mdfd_getseg(reln, blkno);
-
- FileMarkDirty(v->mdfd_vfd);
-
- return SM_SUCCESS;
-}
-
-/*
- * mdblindmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync).
- *
- * We have to be able to do this using only the name and OID of
- * the database and relation in which the block belongs. Otherwise
- * this is much like mdmarkdirty(). However, we do the fsync immediately
- * rather than building md/fd datastructures to postpone it till later.
- */
-int
-mdblindmarkdirty(RelFileNode rnode,
- BlockNumber blkno)
-{
- int status;
- int fd;
-
- fd = _mdfd_blind_getseg(rnode, blkno);
-
- if (fd < 0)
- return SM_FAIL;
-
- status = SM_SUCCESS;
-
- if (pg_fsync(fd) < 0)
- status = SM_FAIL;
-
- if (close(fd) < 0)
- status = SM_FAIL;
-
- return status;
-}
-
-/*
- * mdnblocks() -- Get the number of blocks stored in a relation.
- *
- * Important side effect: all segments of the relation are opened
- * and added to the mdfd_chain list. If this routine has not been
- * called, then only segments up to the last one actually touched
- * are present in the chain...
- *
- * Returns # of blocks, elog's on error.
- */
-BlockNumber
-mdnblocks(Relation reln)
-{
- int fd;
- MdfdVec *v;
-
-#ifndef LET_OS_MANAGE_FILESIZE
- BlockNumber nblocks;
- BlockNumber segno;
-#endif
-
- fd = _mdfd_getrelnfd(reln);
- v = &Md_fdvec[fd];
-
-#ifndef LET_OS_MANAGE_FILESIZE
- segno = 0;
- for (;;)
- {
- nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ);
- if (nblocks > ((BlockNumber) RELSEG_SIZE))
- elog(FATAL, "segment too big in mdnblocks!");
- if (nblocks < ((BlockNumber) RELSEG_SIZE))
- return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
-
- /*
- * If segment is exactly RELSEG_SIZE, advance to next one.
- */
- segno++;
-
- if (v->mdfd_chain == (MdfdVec *) NULL)
- {
- /*
- * Because we pass O_CREAT, we will create the next segment
- * (with zero length) immediately, if the last segment is of
- * length REL_SEGSIZE. This is unnecessary but harmless, and
- * testing for the case would take more cycles than it seems
- * worth.
- */
- v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
- if (v->mdfd_chain == (MdfdVec *) NULL)
- elog(ERROR, "cannot count blocks for %s -- open failed: %m",
- RelationGetRelationName(reln));
- }
-
- v = v->mdfd_chain;
- }
-#else
- return _mdnblocks(v->mdfd_vfd, BLCKSZ);
-#endif
-}
-
-/*
- * mdtruncate() -- Truncate relation to specified number of blocks.
- *
- * Returns # of blocks or InvalidBlockNumber on error.
- */
-BlockNumber
-mdtruncate(Relation reln, BlockNumber nblocks)
-{
- int fd;
- MdfdVec *v;
- BlockNumber curnblk;
-
-#ifndef LET_OS_MANAGE_FILESIZE
- BlockNumber priorblocks;
-#endif
-
- /*
- * NOTE: mdnblocks makes sure we have opened all existing segments, so
- * that truncate/delete loop will get them all!
- */
- curnblk = mdnblocks(reln);
- if (nblocks > curnblk)
- return InvalidBlockNumber; /* bogus request */
- if (nblocks == curnblk)
- return nblocks; /* no work */
-
- fd = _mdfd_getrelnfd(reln);
- v = &Md_fdvec[fd];
-
-#ifndef LET_OS_MANAGE_FILESIZE
- priorblocks = 0;
- while (v != (MdfdVec *) NULL)
- {
- MdfdVec *ov = v;
-
- if (priorblocks > nblocks)
- {
- /*
- * This segment is no longer wanted at all (and has already
- * been unlinked from the mdfd_chain). We truncate the file
- * before deleting it because if other backends are holding
- * the file open, the unlink will fail on some platforms.
- * Better a zero-size file gets left around than a big file...
- */
- FileTruncate(v->mdfd_vfd, 0);
- FileUnlink(v->mdfd_vfd);
- v = v->mdfd_chain;
- Assert(ov != &Md_fdvec[fd]); /* we never drop the 1st
- * segment */
- pfree(ov);
- }
- else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
- {
- /*
- * This is the last segment we want to keep. Truncate the file
- * to the right length, and clear chain link that points to
- * any remaining segments (which we shall zap). NOTE: if
- * nblocks is exactly a multiple K of RELSEG_SIZE, we will
- * truncate the K+1st segment to 0 length but keep it. This is
- * mainly so that the right thing happens if nblocks==0.
- */
- BlockNumber lastsegblocks = nblocks - priorblocks;
-
- if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
- return InvalidBlockNumber;
- v = v->mdfd_chain;
- ov->mdfd_chain = (MdfdVec *) NULL;
- }
- else
- {
- /*
- * We still need this segment and 0 or more blocks beyond it,
- * so nothing to do here.
- */
- v = v->mdfd_chain;
- }
- priorblocks += RELSEG_SIZE;
- }
-#else
- if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
- return InvalidBlockNumber;
-#endif
-
- return nblocks;
-}
-
-/*
- * mdcommit() -- Commit a transaction.
- *
- * All changes to magnetic disk relations must be forced to stable
- * storage. This routine makes a pass over the private table of
- * file descriptors. Any descriptors to which we have done writes,
- * but not synced, are synced here.
- *
- * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
- */
-int
-mdcommit()
-{
- int i;
- MdfdVec *v;
-
- for (i = 0; i < CurFd; i++)
- {
- v = &Md_fdvec[i];
- if (v->mdfd_flags & MDFD_FREE)
- continue;
- /* Sync the file entry */
-#ifndef LET_OS_MANAGE_FILESIZE
- for (; v != (MdfdVec *) NULL; v = v->mdfd_chain)
-#else
- if (v != (MdfdVec *) NULL)
-#endif
- {
- if (FileSync(v->mdfd_vfd) < 0)
- return SM_FAIL;
- }
- }
-
- return SM_SUCCESS;
-}
-
-/*
- * mdabort() -- Abort a transaction.
- *
- * Changes need not be forced to disk at transaction abort. We mark
- * all file descriptors as clean here. Always returns SM_SUCCESS.
- */
-int
-mdabort()
-{
- /*
- * We don't actually have to do anything here. fd.c will discard
- * fsync-needed bits in its AtEOXact_Files() routine.
- */
- return SM_SUCCESS;
-}
-
-/*
- * mdsync() -- Sync storage.
- *
- */
-int
-mdsync()
-{
- sync();
- if (IsUnderPostmaster)
- sleep(2);
- sync();
- return SM_SUCCESS;
-}
-
-/*
- * _fdvec_alloc () -- grab a free (or new) md file descriptor vector.
- *
- */
-static
-int
-_fdvec_alloc()
-{
- MdfdVec *nvec;
- int fdvec,
- i;
-
- if (Md_Free >= 0) /* get from free list */
- {
- fdvec = Md_Free;
- Md_Free = Md_fdvec[fdvec].mdfd_nextFree;
- Assert(Md_fdvec[fdvec].mdfd_flags == MDFD_FREE);
- Md_fdvec[fdvec].mdfd_flags = 0;
- if (fdvec >= CurFd)
- {
- Assert(fdvec == CurFd);
- CurFd++;
- }
- return fdvec;
- }
-
- /* Must allocate more room */
-
- if (Nfds != CurFd)
- elog(FATAL, "_fdvec_alloc error");
-
- Nfds *= 2;
-
- nvec = (MdfdVec *) MemoryContextAlloc(MdCxt, Nfds * sizeof(MdfdVec));
- MemSet(nvec, 0, Nfds * sizeof(MdfdVec));
- memcpy(nvec, (char *) Md_fdvec, CurFd * sizeof(MdfdVec));
- pfree(Md_fdvec);
-
- Md_fdvec = nvec;
-
- /* Set new free list */
- for (i = CurFd; i < Nfds; i++)
- {
- Md_fdvec[i].mdfd_nextFree = i + 1;
- Md_fdvec[i].mdfd_flags = MDFD_FREE;
- }
- Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
- Md_Free = CurFd + 1;
-
- fdvec = CurFd;
- CurFd++;
- Md_fdvec[fdvec].mdfd_flags = 0;
-
- return fdvec;
-}
-
-/*
- * _fdvec_free () -- free md file descriptor vector.
- *
- */
-static
-void
-_fdvec_free(int fdvec)
-{
-
- Assert(Md_Free < 0 || Md_fdvec[Md_Free].mdfd_flags == MDFD_FREE);
- Assert(Md_fdvec[fdvec].mdfd_flags != MDFD_FREE);
- Md_fdvec[fdvec].mdfd_nextFree = Md_Free;
- Md_fdvec[fdvec].mdfd_flags = MDFD_FREE;
- Md_Free = fdvec;
-}
-
-static MdfdVec *
-_mdfd_openseg(Relation reln, BlockNumber segno, int oflags)
-{
- MdfdVec *v;
- int fd;
- char *path,
- *fullpath;
-
- /* be sure we have enough space for the '.segno', if any */
- path = relpath(reln->rd_node);
-
- if (segno > 0)
- {
- fullpath = (char *) palloc(strlen(path) + 12);
- sprintf(fullpath, "%s.%u", path, segno);
- pfree(path);
- }
- else
- fullpath = path;
-
- /* open the file */
- fd = FileNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
-
- pfree(fullpath);
-
- if (fd < 0)
- return (MdfdVec *) NULL;
-
- /* allocate an mdfdvec entry for it */
- v = (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
-
- /* fill the entry */
- v->mdfd_vfd = fd;
- v->mdfd_flags = (uint16) 0;
-#ifndef LET_OS_MANAGE_FILESIZE
- v->mdfd_chain = (MdfdVec *) NULL;
-
-#ifdef DIAGNOSTIC
- if (_mdnblocks(fd, BLCKSZ) > ((BlockNumber) RELSEG_SIZE))
- elog(FATAL, "segment too big on openseg!");
-#endif
-#endif
-
- /* all done */
- return v;
-}
-
-/* Get the fd for the relation, opening it if it's not already open */
-
-static int
-_mdfd_getrelnfd(Relation reln)
-{
- int fd;
-
- fd = RelationGetFile(reln);
- if (fd < 0)
- {
- if ((fd = mdopen(reln)) < 0)
- elog(ERROR, "_mdfd_getrelnfd: cannot open relation %s: %m",
- RelationGetRelationName(reln));
- reln->rd_fd = fd;
- }
- return fd;
-}
-
-/* Find the segment of the relation holding the specified block */
-
-static MdfdVec *
-_mdfd_getseg(Relation reln, BlockNumber blkno)
-{
- MdfdVec *v;
- int fd;
-
-#ifndef LET_OS_MANAGE_FILESIZE
- BlockNumber segno;
- BlockNumber i;
-#endif
-
- fd = _mdfd_getrelnfd(reln);
-
-#ifndef LET_OS_MANAGE_FILESIZE
- for (v = &Md_fdvec[fd], segno = blkno / ((BlockNumber) RELSEG_SIZE), i = 1;
- segno > 0;
- i++, segno--)
- {
-
- if (v->mdfd_chain == (MdfdVec *) NULL)
- {
- /*
- * We will create the next segment only if the target block is
- * within it. This prevents Sorcerer's Apprentice syndrome if
- * a bug at higher levels causes us to be handed a
- * ridiculously large blkno --- otherwise we could create many
- * thousands of empty segment files before reaching the
- * "target" block. We should never need to create more than
- * one new segment per call, so this restriction seems
- * reasonable.
- */
- v->mdfd_chain = _mdfd_openseg(reln, i, (segno == 1) ? O_CREAT : 0);
-
- if (v->mdfd_chain == (MdfdVec *) NULL)
- elog(ERROR, "cannot open segment %u of relation %s (target block %u): %m",
- i, RelationGetRelationName(reln), blkno);
- }
- v = v->mdfd_chain;
- }
-#else
- v = &Md_fdvec[fd];
-#endif
-
- return v;
-}
-
-/*
- * Find the segment of the relation holding the specified block.
- *
- * This performs the same work as _mdfd_getseg() except that we must work
- * "blind" with no Relation struct. We assume that we are not likely to
- * touch the same relation again soon, so we do not create an FD entry for
- * the relation --- we just open a kernel file descriptor which will be
- * used and promptly closed. We also assume that the target block already
- * exists, ie, we need not extend the relation.
- *
- * The return value is the kernel descriptor, or -1 on failure.
- */
-
-static int
-_mdfd_blind_getseg(RelFileNode rnode, BlockNumber blkno)
-{
- char *path;
- int fd;
-
-#ifndef LET_OS_MANAGE_FILESIZE
- BlockNumber segno;
-#endif
-
- path = relpath(rnode);
-
-#ifndef LET_OS_MANAGE_FILESIZE
- /* append the '.segno', if needed */
- segno = blkno / ((BlockNumber) RELSEG_SIZE);
- if (segno > 0)
- {
- char *segpath = (char *) palloc(strlen(path) + 12);
-
- sprintf(segpath, "%s.%u", path, segno);
- pfree(path);
- path = segpath;
- }
-#endif
-
- /* call fd.c to allow other FDs to be closed if needed */
- fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0600);
- if (fd < 0)
- elog(LOG, "_mdfd_blind_getseg: couldn't open %s: %m", path);
-
- pfree(path);
-
- return fd;
-}
-
-static BlockNumber
-_mdnblocks(File file, Size blcksz)
-{
- long len;
-
- len = FileSeek(file, 0L, SEEK_END);
- if (len < 0)
- return 0; /* on failure, assume file is empty */
- return (BlockNumber) (len / blcksz);
-}