Skip to content

Instantly share code, notes, and snippets.

@gitfvb
Created June 13, 2025 09:37
Show Gist options
  • Save gitfvb/f920856818d3c0b585baa400d4e18873 to your computer and use it in GitHub Desktop.
Save gitfvb/f920856818d3c0b585baa400d4e18873 to your computer and use it in GitHub Desktop.
Write the last part of a file into a temporary file and use that with DuckDB
################################################
#
# INPUT
#
################################################
#-----------------------------------------------
# DEBUG SWITCH
#-----------------------------------------------
$debug = $false
If ( $PSBoundParameters["Debug"].IsPresent -eq $true ) {
$debug = $true
}
#-----------------------------------------------
# INPUT PARAMETERS, IF DEBUG IS TRUE
#-----------------------------------------------
if ( $debug -eq $true ) {
$params = [hashtable]@{
# Automatic parameters
Password = 'ko'
Username = 'ko'
#mode='taggingOnly'
# Integration parameters
Force64bit = "true"
#ForceCore = "true"
#ForcePython = "true"
#UseJob = "true"
settingsFile = "C:\FastStats\Scripts\AptecoPSFramework\settings\sfnpc2\settings.yaml"
}
}
################################################
#
# SETTINGS
#
################################################
$settingsfileLocation = "C:\faststats\scripts\build\settings.json"
#-----------------------------------------------
# CHANGE PATH
#-----------------------------------------------
# Set current location to the settings files directory
$settingsFileItem = Get-Item $settingsfileLocation
Set-Location $settingsFileItem.DirectoryName
Import-Module WriteLog
Set-Logfile "E:\temp\duck.log"
################################################
#
# PATH
#
################################################
#-----------------------------------------------
# CHECKING PS AND OS
#-----------------------------------------------
Write-Verbose "Check PowerShell and Operating system" -Verbose
# Check if this is Pwsh Core
$isCore = ($PSVersionTable.Keys -contains "PSEdition") -and ($PSVersionTable.PSEdition -ne 'Desktop')
Write-Verbose -Message "Using PowerShell version $( $PSVersionTable.PSVersion.ToString() ) and $( $PSVersionTable.PSEdition ) edition" -Verbose
# Check the operating system, if Core
if ($isCore -eq $true) {
$os = If ( $IsWindows -eq $true ) {
"Windows"
} elseif ( $IsLinux -eq $true ) {
"Linux"
} elseif ( $IsMacOS -eq $true ) {
"MacOS"
} else {
throw "Unknown operating system"
}
} else {
# [System.Environment]::OSVersion.VersionString()
# [System.Environment]::Is64BitOperatingSystem
$os = "Windows"
}
Write-Verbose -Message "Using OS: $( $os )" -Verbose
#-----------------------------------------------
# ADD MODULE PATH, IF NOT PRESENT
#-----------------------------------------------
$modulePath = @( [System.Environment]::GetEnvironmentVariable("PSModulePath") -split ";" ) + @(
"$( [System.Environment]::GetEnvironmentVariable("ProgramFiles") )\WindowsPowerShell\Modules"
"$( [System.Environment]::GetEnvironmentVariable("ProgramFiles(x86)") )\WindowsPowerShell\Modules"
"$( [System.Environment]::GetEnvironmentVariable("USERPROFILE") )\Documents\WindowsPowerShell\Modules"
"$( [System.Environment]::GetEnvironmentVariable("windir") )\system32\WindowsPowerShell\v1.0\Modules"
)
# Add the 64bit path, if present. In 32bit the ProgramFiles variables only returns the x86 path
If ( [System.Environment]::GetEnvironmentVariables().keys -contains "ProgramW6432" ) {
$modulePath += "$( [System.Environment]::GetEnvironmentVariable("ProgramW6432") )\WindowsPowerShell\Modules"
}
# Add pwsh core path
If ( $isCore -eq $true ) {
If ( [System.Environment]::GetEnvironmentVariables().keys -contains "ProgramW6432" ) {
$modulePath += "$( [System.Environment]::GetEnvironmentVariable("ProgramW6432") )\powershell\7\Modules"
}
$modulePath += "$( [System.Environment]::GetEnvironmentVariable("ProgramFiles") )\powershell\7\Modules"
$modulePath += "$( [System.Environment]::GetEnvironmentVariable("ProgramFiles(x86)") )\powershell\7\Modules"
}
# Add all paths
# Using $env:PSModulePath for only temporary override
$Env:PSModulePath = @( $modulePath | Sort-Object -unique ) -join ";"
#-----------------------------------------------
# ADD SCRIPT PATH, IF NOT PRESENT
#-----------------------------------------------
#$envVariables = [System.Environment]::GetEnvironmentVariables()
$scriptPath = @( [System.Environment]::GetEnvironmentVariable("Path") -split ";" ) + @(
"$( [System.Environment]::GetEnvironmentVariable("ProgramFiles") )\WindowsPowerShell\Scripts"
"$( [System.Environment]::GetEnvironmentVariable("ProgramFiles(x86)") )\WindowsPowerShell\Scripts"
"$( [System.Environment]::GetEnvironmentVariable("USERPROFILE") )\Documents\WindowsPowerShell\Scripts"
)
# Add the 64bit path, if present. In 32bit the ProgramFiles variables only returns the x86 path
If ( [System.Environment]::GetEnvironmentVariables().keys -contains "ProgramW6432" ) {
$scriptPath += "$( [System.Environment]::GetEnvironmentVariable("ProgramW6432") )\WindowsPowerShell\Scripts"
}
# Add pwsh core path
If ( $isCore -eq $true ) {
If ( [System.Environment]::GetEnvironmentVariables().keys -contains "ProgramW6432" ) {
$scriptPath += "$( [System.Environment]::GetEnvironmentVariable("ProgramW6432") )\powershell\7\Scripts"
}
$scriptPath += "$( [System.Environment]::GetEnvironmentVariable("ProgramFiles") )\powershell\7\Scripts"
$scriptPath += "$( [System.Environment]::GetEnvironmentVariable("ProgramFiles(x86)") )\powershell\7\Scripts"
}
# Using $env:Path for only temporary override
$Env:Path = @( $scriptPath | Sort-Object -unique ) -join ";"
################################################
#
# CHECKS
#
################################################
#-----------------------------------------------
# DEFAULT VALUES
#-----------------------------------------------
$useJob = $false
$enforce64Bit = $false
$enforceCore = $false
$enforcePython = $false
$isPsCoreInstalled = $false
#-----------------------------------------------
# IMPORT MODULE
#-----------------------------------------------
If ($debug -eq $true) {
Import-Module "AptecoPSFramework" -Verbose
} else {
Import-Module "AptecoPSFramework"
}
#-----------------------------------------------
# SET DEBUG MODE
#-----------------------------------------------
Set-DebugMode -DebugMode $debug
#-----------------------------------------------
# SET SETTINGS
#-----------------------------------------------
# Set the settings
#If ( $useJob -eq $true -and $ProcessId -ne "") {
# Import-Settings -Path $settingsfileLocation -ProcessId $ProcessId
#} else {
# Import-Settings -Path $settingsfileLocation
#}
# Get all settings
$s = Get-Settings
#-----------------------------------------------
# FIND OUT ABOUT PS CORE
#-----------------------------------------------
try {
$calc = . $s.psCoreExePath { 1+1 }
} catch {
# just a test, nothing to do
}
if ( $calc -eq 2 ) {
$isPsCoreInstalled = $true
}
#-----------------------------------------------
# FIND OUT THE MODE
#-----------------------------------------------
$mode = "function"
If ( $enforce64Bit -eq $true ) {
$mode = "PSWin64"
} elseif ( $enforceCore -eq $true ) {
$mode = "PSCore"
} elseif ( $enforcePython -eq $true ) {
$mode = "Python"
}
import-lib
Open-DuckDBConnection
Read-DuckDBQueryAsScalar -Query "Select 10+15"
#Import-Dependencies -LoadWholePackageFolder -LocalPackageFolder ".\lib" -SuppressWarnings #$localLibFolderItem.name
$path = "E:\FastStats\Build\test\Extract\lineitems.txt"
$tempFile = "E:\temp\$( [guid]::NewGuid().toString() ).csv"
$sniffRecords = 10000000
$ernColumn = "ERNTS"
# Write the last n lines into a new file
# 100k rows need 2 seconds
# 1m rows need 18 seconds
# 10m rows need 186 seconds
Write-Log "Writing the last $( $sniffRecords ) rows of '$( $path )' into '$( $tempFile )'"
$t = Measure-Command {
Get-Content -totalcount 1 -Encoding UTF8 -path $path | Set-content -Path $tempFile -encoding utf8
Get-Content -Tail $sniffRecords -Encoding UTF8 -path $path -ReadCount 1000 | Add-Content -Path $tempFile -Encoding UTF8
}
Write-Log "Needed $( $t.TotalSeconds ) to write '$( $tempFile )'"
# Collect data about that file
# 10m rows need about 21 seconds
Write-Log "Reading the temporary file '$( $tempFile )'"
$t = Measure-Command {
$summary = Read-DuckDBQueryAsReader -Query "SUMMARIZE TABLE '$( $tempFile )'" -ReturnAsPSCustom
}
Write-Log "Needed $( $t.TotalSeconds ) to summarize file '$( $tempFile )'"
# Checking the column
Write-Log "Checking the column '$( $ernColumn )' for the max value"
$ernSummary = $summary | where-object { $_."column_name" -eq $ernColumn }
Write-Log " Min: $( $ernSummary.min )"
Write-Log " Max: $( $ernSummary.max )"
# Remove the file
Write-Log "Removing the temporary file at '$( $tempFile )'"
Remove-Item -Path $tempFile
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment