Created
June 13, 2025 09:37
-
-
Save gitfvb/f920856818d3c0b585baa400d4e18873 to your computer and use it in GitHub Desktop.
Write the last part of a file into a temporary file and use that with DuckDB
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################ | |
# | |
# INPUT | |
# | |
################################################ | |
#----------------------------------------------- | |
# DEBUG SWITCH | |
#----------------------------------------------- | |
$debug = $false | |
If ( $PSBoundParameters["Debug"].IsPresent -eq $true ) { | |
$debug = $true | |
} | |
#----------------------------------------------- | |
# INPUT PARAMETERS, IF DEBUG IS TRUE | |
#----------------------------------------------- | |
if ( $debug -eq $true ) { | |
$params = [hashtable]@{ | |
# Automatic parameters | |
Password = 'ko' | |
Username = 'ko' | |
#mode='taggingOnly' | |
# Integration parameters | |
Force64bit = "true" | |
#ForceCore = "true" | |
#ForcePython = "true" | |
#UseJob = "true" | |
settingsFile = "C:\FastStats\Scripts\AptecoPSFramework\settings\sfnpc2\settings.yaml" | |
} | |
} | |
################################################ | |
# | |
# SETTINGS | |
# | |
################################################ | |
$settingsfileLocation = "C:\faststats\scripts\build\settings.json" | |
#----------------------------------------------- | |
# CHANGE PATH | |
#----------------------------------------------- | |
# Set current location to the settings files directory | |
$settingsFileItem = Get-Item $settingsfileLocation | |
Set-Location $settingsFileItem.DirectoryName | |
Import-Module WriteLog | |
Set-Logfile "E:\temp\duck.log" | |
################################################ | |
# | |
# PATH | |
# | |
################################################ | |
#----------------------------------------------- | |
# CHECKING PS AND OS | |
#----------------------------------------------- | |
Write-Verbose "Check PowerShell and Operating system" -Verbose | |
# Check if this is Pwsh Core | |
$isCore = ($PSVersionTable.Keys -contains "PSEdition") -and ($PSVersionTable.PSEdition -ne 'Desktop') | |
Write-Verbose -Message "Using PowerShell version $( $PSVersionTable.PSVersion.ToString() ) and $( $PSVersionTable.PSEdition ) edition" -Verbose | |
# Check the operating system, if Core | |
if ($isCore -eq $true) { | |
$os = If ( $IsWindows -eq $true ) { | |
"Windows" | |
} elseif ( $IsLinux -eq $true ) { | |
"Linux" | |
} elseif ( $IsMacOS -eq $true ) { | |
"MacOS" | |
} else { | |
throw "Unknown operating system" | |
} | |
} else { | |
# [System.Environment]::OSVersion.VersionString() | |
# [System.Environment]::Is64BitOperatingSystem | |
$os = "Windows" | |
} | |
Write-Verbose -Message "Using OS: $( $os )" -Verbose | |
#----------------------------------------------- | |
# ADD MODULE PATH, IF NOT PRESENT | |
#----------------------------------------------- | |
$modulePath = @( [System.Environment]::GetEnvironmentVariable("PSModulePath") -split ";" ) + @( | |
"$( [System.Environment]::GetEnvironmentVariable("ProgramFiles") )\WindowsPowerShell\Modules" | |
"$( [System.Environment]::GetEnvironmentVariable("ProgramFiles(x86)") )\WindowsPowerShell\Modules" | |
"$( [System.Environment]::GetEnvironmentVariable("USERPROFILE") )\Documents\WindowsPowerShell\Modules" | |
"$( [System.Environment]::GetEnvironmentVariable("windir") )\system32\WindowsPowerShell\v1.0\Modules" | |
) | |
# Add the 64bit path, if present. In 32bit the ProgramFiles variables only returns the x86 path | |
If ( [System.Environment]::GetEnvironmentVariables().keys -contains "ProgramW6432" ) { | |
$modulePath += "$( [System.Environment]::GetEnvironmentVariable("ProgramW6432") )\WindowsPowerShell\Modules" | |
} | |
# Add pwsh core path | |
If ( $isCore -eq $true ) { | |
If ( [System.Environment]::GetEnvironmentVariables().keys -contains "ProgramW6432" ) { | |
$modulePath += "$( [System.Environment]::GetEnvironmentVariable("ProgramW6432") )\powershell\7\Modules" | |
} | |
$modulePath += "$( [System.Environment]::GetEnvironmentVariable("ProgramFiles") )\powershell\7\Modules" | |
$modulePath += "$( [System.Environment]::GetEnvironmentVariable("ProgramFiles(x86)") )\powershell\7\Modules" | |
} | |
# Add all paths | |
# Using $env:PSModulePath for only temporary override | |
$Env:PSModulePath = @( $modulePath | Sort-Object -unique ) -join ";" | |
#----------------------------------------------- | |
# ADD SCRIPT PATH, IF NOT PRESENT | |
#----------------------------------------------- | |
#$envVariables = [System.Environment]::GetEnvironmentVariables() | |
$scriptPath = @( [System.Environment]::GetEnvironmentVariable("Path") -split ";" ) + @( | |
"$( [System.Environment]::GetEnvironmentVariable("ProgramFiles") )\WindowsPowerShell\Scripts" | |
"$( [System.Environment]::GetEnvironmentVariable("ProgramFiles(x86)") )\WindowsPowerShell\Scripts" | |
"$( [System.Environment]::GetEnvironmentVariable("USERPROFILE") )\Documents\WindowsPowerShell\Scripts" | |
) | |
# Add the 64bit path, if present. In 32bit the ProgramFiles variables only returns the x86 path | |
If ( [System.Environment]::GetEnvironmentVariables().keys -contains "ProgramW6432" ) { | |
$scriptPath += "$( [System.Environment]::GetEnvironmentVariable("ProgramW6432") )\WindowsPowerShell\Scripts" | |
} | |
# Add pwsh core path | |
If ( $isCore -eq $true ) { | |
If ( [System.Environment]::GetEnvironmentVariables().keys -contains "ProgramW6432" ) { | |
$scriptPath += "$( [System.Environment]::GetEnvironmentVariable("ProgramW6432") )\powershell\7\Scripts" | |
} | |
$scriptPath += "$( [System.Environment]::GetEnvironmentVariable("ProgramFiles") )\powershell\7\Scripts" | |
$scriptPath += "$( [System.Environment]::GetEnvironmentVariable("ProgramFiles(x86)") )\powershell\7\Scripts" | |
} | |
# Using $env:Path for only temporary override | |
$Env:Path = @( $scriptPath | Sort-Object -unique ) -join ";" | |
################################################ | |
# | |
# CHECKS | |
# | |
################################################ | |
#----------------------------------------------- | |
# DEFAULT VALUES | |
#----------------------------------------------- | |
$useJob = $false | |
$enforce64Bit = $false | |
$enforceCore = $false | |
$enforcePython = $false | |
$isPsCoreInstalled = $false | |
#----------------------------------------------- | |
# IMPORT MODULE | |
#----------------------------------------------- | |
If ($debug -eq $true) { | |
Import-Module "AptecoPSFramework" -Verbose | |
} else { | |
Import-Module "AptecoPSFramework" | |
} | |
#----------------------------------------------- | |
# SET DEBUG MODE | |
#----------------------------------------------- | |
Set-DebugMode -DebugMode $debug | |
#----------------------------------------------- | |
# SET SETTINGS | |
#----------------------------------------------- | |
# Set the settings | |
#If ( $useJob -eq $true -and $ProcessId -ne "") { | |
# Import-Settings -Path $settingsfileLocation -ProcessId $ProcessId | |
#} else { | |
# Import-Settings -Path $settingsfileLocation | |
#} | |
# Get all settings | |
$s = Get-Settings | |
#----------------------------------------------- | |
# FIND OUT ABOUT PS CORE | |
#----------------------------------------------- | |
try { | |
$calc = . $s.psCoreExePath { 1+1 } | |
} catch { | |
# just a test, nothing to do | |
} | |
if ( $calc -eq 2 ) { | |
$isPsCoreInstalled = $true | |
} | |
#----------------------------------------------- | |
# FIND OUT THE MODE | |
#----------------------------------------------- | |
$mode = "function" | |
If ( $enforce64Bit -eq $true ) { | |
$mode = "PSWin64" | |
} elseif ( $enforceCore -eq $true ) { | |
$mode = "PSCore" | |
} elseif ( $enforcePython -eq $true ) { | |
$mode = "Python" | |
} | |
import-lib | |
Open-DuckDBConnection | |
Read-DuckDBQueryAsScalar -Query "Select 10+15" | |
#Import-Dependencies -LoadWholePackageFolder -LocalPackageFolder ".\lib" -SuppressWarnings #$localLibFolderItem.name | |
$path = "E:\FastStats\Build\test\Extract\lineitems.txt" | |
$tempFile = "E:\temp\$( [guid]::NewGuid().toString() ).csv" | |
$sniffRecords = 10000000 | |
$ernColumn = "ERNTS" | |
# Write the last n lines into a new file | |
# 100k rows need 2 seconds | |
# 1m rows need 18 seconds | |
# 10m rows need 186 seconds | |
Write-Log "Writing the last $( $sniffRecords ) rows of '$( $path )' into '$( $tempFile )'" | |
$t = Measure-Command { | |
Get-Content -totalcount 1 -Encoding UTF8 -path $path | Set-content -Path $tempFile -encoding utf8 | |
Get-Content -Tail $sniffRecords -Encoding UTF8 -path $path -ReadCount 1000 | Add-Content -Path $tempFile -Encoding UTF8 | |
} | |
Write-Log "Needed $( $t.TotalSeconds ) to write '$( $tempFile )'" | |
# Collect data about that file | |
# 10m rows need about 21 seconds | |
Write-Log "Reading the temporary file '$( $tempFile )'" | |
$t = Measure-Command { | |
$summary = Read-DuckDBQueryAsReader -Query "SUMMARIZE TABLE '$( $tempFile )'" -ReturnAsPSCustom | |
} | |
Write-Log "Needed $( $t.TotalSeconds ) to summarize file '$( $tempFile )'" | |
# Checking the column | |
Write-Log "Checking the column '$( $ernColumn )' for the max value" | |
$ernSummary = $summary | where-object { $_."column_name" -eq $ernColumn } | |
Write-Log " Min: $( $ernSummary.min )" | |
Write-Log " Max: $( $ernSummary.max )" | |
# Remove the file | |
Write-Log "Removing the temporary file at '$( $tempFile )'" | |
Remove-Item -Path $tempFile | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment